import gradio as gr
import requests
import os
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import time
import cv2
import numpy as np
import webcolors
import json
import re
from gradio_client import Client
import ast
import spaces
from profanityfilter import ProfanityFilter
import torch
from diffusers import DiffusionPipeline, AutoencoderKL, DPMSolverMultistepScheduler

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    raise ValueError("need to run on GPU")

vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
text2img_pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    vae=vae,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
).to(device)
text2img_pipe.scheduler = DPMSolverMultistepScheduler.from_config(text2img_pipe.scheduler.config)

# Performance/memory optimizations
text2img_pipe.enable_model_cpu_offload()
text2img_pipe.unet.to(memory_format=torch.channels_last)  # in-place operation
# pipeline_img2img.enable_sequential_cpu_offload()
text2img_pipe.enable_vae_tiling()
text2img_pipe.enable_attention_slicing()

text2img_pipe.load_lora_weights('sessex/tabi-0-LoRA')

negative_prompt = "nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"


def prepare_image_for_watermark(image):
    # Create a new white background image
    background = Image.new('RGB', (1200, 1500), color='white')

    # Calculate the position to paste the image onto the white background
    x_offset = (1200 - image.width) // 2
    y_offset = 48  # position from top

    # Paste the resized image onto the white background at the specified position
    background.paste(image, (x_offset, y_offset))

    return background


# Initialize the profanity filter
pf = ProfanityFilter()

def filter_inappropriate(input_text):
    # Filter out inappropriate words
    pf.censor_char = ' '
    filtered_text = pf.censor(input_text)
    return filtered_text.strip()


# find the closest color name to rgb value
# def closest_color(rgb_color):
#     min_colors = {}
#     for key, name in webcolors.CSS3_HEX_TO_NAMES.items():
#         r_c, g_c, b_c = webcolors.hex_to_rgb(key)
#         rd = (r_c - rgb_color[0]) ** 2
#         gd = (g_c - rgb_color[1]) ** 2
#         bd = (b_c - rgb_color[2]) ** 2
#         min_colors[(rd + gd + bd)] = name
#     return min_colors[min(min_colors.keys())]

# def get_dominant_colors(img_filepath):
#     # Load the image from file path
#     img_data = Image.open(img_filepath)
#     # Convert the image to a NumPy array
#     img = np.array(img_data)
#
#     # k-means clustering to create palette of most dominant n_colors
#     pixels = np.float32(img.reshape(-1, 3))
#     n_colors = 2
#     criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
#     flags = cv2.KMEANS_RANDOM_CENTERS
#     _, labels, palette = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)
#
#     # get names of dominant colors
#     dominant_colors = []
#     for color in palette:
#         color_name = closest_color(color)
#         dominant_colors.append(color_name)
#
#     return dominant_colors


def get_image_caption(image):
    client = Client("https://vikhyatk-moondream1.hf.space/")
    result = client.predict(
        image,  # filepath in 'image' Image component
        "What colors and patterns appear in this photo?",
        api_name="/answer_question"
    )
    # print(result)
    return result

def get_image_keywords(image):
    # get img2text description
    caption = get_image_caption(image)

    # get colors
    # colors_list = get_dominant_colors(image)
    # colors = ", ".join(colors_list)

    return caption, ""
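# The color-extraction path is currently disabled, so get_image_keywords always returns an
# empty color string. A minimal sketch, assuming the commented-out get_dominant_colors()
# helper above were restored as written, of how colors could be returned alongside the caption:
#
#     caption = get_image_caption(image)
#     colors = ", ".join(get_dominant_colors(image))
#     return caption, colors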
from transformers import pipeline

pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto")

def construct_prompt(image_caption, image_colors, user_input):
    agent_maker_sys = f"""
You are an AI whose job is to help users create their own custom shoe image which will reflect the colors, characteristics, or aesthetics from an image described by users.
In particular, you need to respond succinctly and write a prompt for an image generation model.
The response must include the word "mm-tabi" which will trigger the style of shoe.
The response should avoid any descriptions of a man or woman and should not include any articles of clothing or accessories from the Caption.
The response should always start with "surreal photo of mm-tabi boot with split toe".
The response should always end with "still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k".
The response should only use one or two aspects from the Caption provided by the user that could easily be applied to a still life scene or characteristic of the shoe, like color or texture or an object.

For example, if a user says, "Keywords: California dogs sunshine shopping /n Caption: The photo features a woman wearing a blue sweater with a red and white design. The sweater is a prominent feature in the image, and it is the main focus of the scene. The background is plain, with no other colors or patterns visible. The woman is standing in front of a building, which serves as a backdrop for the photo."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, surrounded by California dogs sunshine shopping, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, blue"

If a user says, "Keywords: Bunny /n Caption: The photo features a young woman wearing a black sweatshirt with a red and white pattern. She is standing in a large, empty room, which appears to be a mall or a similar public space. The room has a white ceiling and is decorated with various colors and patterns, creating a visually interesting environment. There is also a handbag visible in the scene, placed close to the woman."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, surrounded by bunny, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, red, white"

If a user says, "Keywords: Sun /n Caption: The photo features a woman wearing a striped shirt, which has a combination of black, white, and gray colors. She is also wearing glasses, and her smile adds a positive touch to the image. Additionally, she is holding a cell phone in her hand, which is being photographed. The background of the photo is plain, with no visible patterns or colors, allowing the focus to be on the woman and her attire."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, surrounded by sun, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k"

Here's another example, if a user says, "Keywords: Chitose Abe /n Caption: The photo features a young man wearing a black jacket and a white baseball cap. He is smiling and posing for the camera, with a tan and black jacket and a white cap. The man is carrying a bag, which is visible in the image. The background of the photo is white, and there is a person standing behind the young man, possibly a friend or a passerby."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, surrounded by Chitose-Abe inspired, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, black, white"

Here's another example, if a user says, "Keywords: Painted /n Caption: The photo features a young woman with black hair, wearing a black shirt. She is holding a cupcake with green frosting and red candy on top. The cupcake itself has a green frosting and red candy, which adds a pop of color to the scene. The overall image showcases a combination of black, green, and red colors, along with the woman's smiling expression, creating a visually appealing and vibrant scene."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, painted, surrounded by cupcake, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, green, red"

Here's another example, if a user says, "Keywords: Sneaker /n Caption: The photo features a woman wearing a black top with a white bottom. The top has a black collar, and the woman is posing in front of a large building. The building's interior is decorated with white tiles, which create a contrasting background for the woman's outfit."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi sneaker with split toe, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, black"

Here's another example, if a user says, "Keywords: Ruby black /n Caption: The photo features a woman wearing a dark blue sweater with a black and white checkered pattern. She is posing in front of a large, white building, which could be a hotel or a mall. The background is plain, with no visible patterns or colors. The woman is holding a handbag, which is also black and white in design."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, dark blue, surrounded by ruby black, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k"""

    instruction = f"""
<|system|>
{agent_maker_sys}
<|user|>
"""

    prompt = f"{instruction.strip()}\n Keywords: {user_input} \n Caption: {image_caption}"
    print(f"INPUTS: \n Keywords: {user_input} \n Caption: {image_caption}")

    with torch.no_grad():
        outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)

    # Strip the echoed system/user turns so only the assistant's prompt remains
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)
    return cleaned_text.lstrip("\n")


def text2img_inference(prompt):
    gr.Info('Image generation request sent')
    with torch.no_grad():
        image = text2img_pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=1024,
            height=1024,
            num_inference_steps=30,
            guidance_scale=7.5
        ).images[0]
    return image


def generate_image(user_input, input_image, image_prompt):
    start_time = time.time()

    if image_prompt == "":
        # generate keywords from image
        gr.Info('Starting to generate caption for input image')
        img_caption, img_colors = get_image_keywords(input_image)

        # filter user input
        gr.Info('Processing keywords for inappropriate language')
        user_input = filter_inappropriate(user_input)

        # construct prompt from image caption, image colors, and user input
        gr.Info('Constructing prompt')
        full_prompt = construct_prompt(img_caption, img_colors, user_input)
        print(f"FULL PROMPT: {full_prompt}")

    prompt = image_prompt if image_prompt != "" else full_prompt

    # text2img generation with full prompt construction
    image = text2img_inference(prompt)
    watermarkable_image = prepare_image_for_watermark(image)

    end_time = time.time()
    torch.cuda.empty_cache()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print("Elapsed time:", elapsed_time, "seconds")

    return watermarkable_image, prompt, image


gradio_app = gr.Interface(
    fn=generate_image,
    inputs=[gr.Text(label="User Keywords"), gr.Image(label="Input Image", type='filepath'), gr.Text(label="Generated Prompt")],
    outputs=[gr.Image(label="Image Generation"), gr.Text(label="Image Prompt"), gr.Image(label="Raw Image Generation")],
    title="Custom Tabi",
    description="Enter keywords and upload an image to generate a custom Tabi boot"
)

gradio_app.launch(debug=True)
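# A minimal manual-test sketch: generate_image can also be called directly, bypassing the
# Gradio UI. The "input.jpg" path below is a hypothetical placeholder, and the empty third
# argument makes the function construct the prompt from the image caption and keywords rather
# than using a pre-written prompt.
#
#     watermarked, prompt, raw = generate_image("sunset surf", "input.jpg", "")
#     watermarked.save("tabi_generation.png")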