Photo2ShojoManga

Running on Zero

App Files Files Community

tori29umai committed on Jun 19

Commit

3542be4

•

1 Parent(s): 9330b76

app.py

Browse files

Files changed (7) hide show

app.py +154 -0
config.json +57 -0
requirements.txt +21 -0
utils/prompt_analysis.py +41 -0
utils/prompt_utils.py +28 -0
utils/tagger.py +149 -0
utils/utils.py +76 -0

app.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import spaces
+import gradio as gr
+from gradio_imageslider import ImageSlider
+import torch
+torch.jit.script = lambda f: f
+from diffusers import (
+    ControlNetModel,
+    StableDiffusionXLControlNetImg2ImgPipeline,
+    DDIMScheduler,
+)
+from controlnet_aux import AnylineDetector
+from compel import Compel, ReturnedEmbeddingsType
+from PIL import Image
+import os
+import time
+import numpy as np
+from utils.utils import load_cn_model, load_cn_config, load_tagger_model, resize_image_aspect_ratio, base_generation
+from utils.prompt_analysis import PromptAnalysis
# Model assets are kept in sub-directories of the current working directory.
path = os.getcwd()
cn_dir = f"{path}/controlnet"
tagger_dir = f"{path}/tagger"
# The loaders below write files into these directories, so create them up
# front; otherwise a fresh checkout fails on the missing folders.
os.makedirs(cn_dir, exist_ok=True)
os.makedirs(tagger_dir, exist_ok=True)
load_cn_model(cn_dir)
load_cn_config(cn_dir)
# NOTE(review): load_tagger_model is imported from utils.utils; presumably it
# fetches the tagger files into tagger_dir - confirm against its definition.
load_tagger_model(tagger_dir)

# Runtime environment flags.
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None
device = "cuda" if torch.cuda.is_available() else "cpu"
# Use half precision only together with CUDA; on a CPU-only host fall back to
# float32 instead of forcing fp16 tensors onto the CPU.
dtype = torch.float16 if device == "cuda" else torch.float32
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"device: {device}")
print(f"dtype: {dtype}")
print(f"low memory: {LOW_MEMORY}")

# Base checkpoint, its DDIM scheduler and the line-art ControlNet.
model = "cagliostrolab/animagine-xl-3.1"
scheduler = DDIMScheduler.from_pretrained(model, subfolder="scheduler")
# Load the ControlNet with the same dtype as the pipeline so both always agree.
controlnet = ControlNetModel.from_pretrained(cn_dir, torch_dtype=dtype, use_safetensors=True)
pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
    model,
    controlnet=controlnet,
    torch_dtype=dtype,
    variant="fp16",
    use_safetensors=True,
    scheduler=scheduler,
)
# Compel builds the prompt embeddings from both SDXL tokenizers/text encoders;
# only the second encoder contributes a pooled embedding.
compel = Compel(
    tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
    text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)
pipe = pipe.to(device)
@spaces.GPU
def predict(
    input_image,
    prompt,
    negative_prompt,
    controlnet_conditioning_scale,
):
    """Generate an image from the uploaded picture with the ControlNet pipeline.

    The input image is resized to one of the preset resolutions and used as
    the ControlNet control image; generation itself starts from a plain white
    canvas with ``strength=1.0`` and a fixed seed of 0.

    Args:
        input_image: PIL image coming from the UI.
        prompt: Positive prompt text.
        negative_prompt: Negative prompt text.
        controlnet_conditioning_scale: ControlNet conditioning scale; it is
            converted to ``float`` before being passed to the pipeline.

    Returns:
        The generated PIL image, resized back to the size of ``input_image``.

    Raises:
        gr.Error: If no input image was supplied.
    """
    if input_image is None:
        # Without this guard the function fails on `None.size` below.
        raise gr.Error("Please upload an input image first.")

    base_size = input_image.size
    control_image = resize_image_aspect_ratio(input_image)
    width, height = control_image.size
    white_base_pil = base_generation(control_image.size, (255, 255, 255, 255)).convert("RGB")

    # Row 0 of each tensor belongs to the prompt, row 1 to the negative prompt.
    conditioning, pooled = compel([prompt, negative_prompt])
    generator = torch.manual_seed(0)
    last_time = time.time()
    result = pipe(
        image=white_base_pil,
        control_image=control_image,
        strength=1.0,
        prompt_embeds=conditioning[0:1],
        pooled_prompt_embeds=pooled[0:1],
        negative_prompt_embeds=conditioning[1:2],
        negative_pooled_prompt_embeds=pooled[1:2],
        width=width,
        height=height,
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
        # The pipeline names these arguments control_guidance_start/end.
        control_guidance_start=0.0,
        control_guidance_end=1.0,
        generator=generator,
        num_inference_steps=30,
        guidance_scale=8.5,
        eta=1.0,
    )
    print(f"Time taken: {time.time() - last_time}")
    # The pipeline returns an output object; the picture is its first image.
    output_image = result.images[0]
    output_image = output_image.resize(base_size, Image.LANCZOS)
    return output_image
from types import SimpleNamespace

# Custom CSS handed to gr.Blocks.
# NOTE(review): the "#"-prefixed lines inside the rule are not CSS comments
# (CSS uses /* ... */), so the rule looks intentionally disabled. The string
# is kept exactly as it was.
css = """
#intro{
    # max-width: 32rem;
    # text-align: center;
    # margin: 0 auto;
}
"""


class _UiLabels:
    """Caption provider for PromptAnalysis.layout, which looks up the text of
    its widgets through ``get_text(key)``."""

    _TEXT = {
        "prompt": "プロンプト",
        "negative_prompt": "ネガティブプロンプト",
        "analyze_prompt": "プロンプト解析",
    }

    def get_text(self, key):
        # Fall back to the key itself for captions that are not listed.
        return self._TEXT.get(key, key)


with gr.Blocks(css=css) as demo:
    with gr.Row() as block:
        with gr.Column():
            # Row for the image upload
            with gr.Row():
                with gr.Column():
                    input_image = gr.Image(label="入力画像", type="pil")
            # Row for the prompt inputs
            with gr.Row():
                # PromptAnalysis reads `.dpath` from its first argument and
                # derives "<dpath>/models/tagger" from it, while this script
                # prepares the tagger files under tagger_dir; build it from a
                # small config object and point it at tagger_dir explicitly.
                prompt_analysis = PromptAnalysis(SimpleNamespace(dpath=path))
                prompt_analysis.model_dir = tagger_dir
                # layout() is an instance method taking (lang_util, input_image).
                [prompt, nega] = prompt_analysis.layout(_UiLabels(), input_image)
            # Row with the slider for the detailed image settings
            with gr.Row():
                controlnet_conditioning_scale = gr.Slider(minimum=0.5, maximum=1.25, value=1.0, step=0.01, interactive=True, label="ラインアートの忠実度")
            # Row with the generate button
            with gr.Row():
                generate_button = gr.Button("生成", interactive=False)
        with gr.Column():
            output_image = gr.Image(type="pil", label="Output Image")
        # Inputs and outputs of the generation callback
        inputs = [
            input_image,
            prompt,
            nega,
            controlnet_conditioning_scale,
        ]
        outputs = [output_image]
        # The button starts disabled; enable it only while an image is present,
        # otherwise it could never be clicked.
        input_image.change(
            fn=lambda image: gr.update(interactive=image is not None),
            inputs=[input_image],
            outputs=[generate_button],
        )
        # Wire the click event of the button
        generate_button.click(
            fn=predict,
            inputs=inputs,
            outputs=outputs,
        )
# Configure and launch the demo
demo.queue(api_open=True)
demo.launch(show_api=True)

config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "_class_name": "ControlNetModel",
+  "_diffusers_version": "0.27.2",
+  "act_fn": "silu",
+  "addition_embed_type": "text_time",
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": 256,
+  "attention_head_dim": [
+    5,
+    10,
+    20
+  ],
+  "block_out_channels": [
+    320,
+    640,
+    1280
+  ],
+  "class_embed_type": null,
+  "conditioning_channels": 3,
+  "conditioning_embedding_out_channels": [
+    16,
+    32,
+    96,
+    256
+  ],
+  "controlnet_conditioning_channel_order": "rgb",
+  "cross_attention_dim": 2048,
+  "down_block_types": [
+    "DownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "global_pool_conditions": false,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "projection_class_embeddings_input_dim": 2816,
+  "resnet_time_scale_shift": "default",
+  "transformer_layers_per_block": [
+    1,
+    2,
+    10
+  ],
+  "upcast_attention": null,
+  "use_linear_projection": true
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+gradio==4.29.0
+accelerate
+transformers
+torchvision
+xformers
+accelerate
+invisible-watermark
+huggingface-hub
+hf-transfer
+gradio_imageslider==0.0.20
+compel
+opencv-python
+numpy
+diffusers==0.27.0
+transformers
+accelerate
+safetensors
+hidiffusion==0.1.8
+spaces
+torch==2.2
+controlnet-aux==0.0.9

utils/prompt_analysis.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import os
+import gradio as gr
+from utils.prompt_utils import remove_color
+from utils.tagger import modelLoad, analysis
class PromptAnalysis:
    """Prompt and negative-prompt text boxes plus a tagger-backed button.

    The button runs the tagger on the given input image and writes the
    resulting tag string into the prompt text box.
    """

    # Captions used by layout() when no label provider is supplied.
    _DEFAULT_LABELS = {
        "prompt": "Prompt",
        "negative_prompt": "Negative prompt",
        "analyze_prompt": "Analyze prompt",
    }

    def __init__(self, app_config, post_filter=True,
                 default_nagative_prompt="lowres, error, extra digit, fewer digits, cropped, worst quality, "
                                         "low quality, normal quality, jpeg artifacts, blurry"):
        """Store the settings; the tagger model itself is loaded lazily.

        Args:
            app_config: Either the tagger model directory itself (``str`` or
                path-like), or a config object whose ``dpath`` attribute is
                the root under which ``models/tagger`` is looked up.
            post_filter: When true, colour tags are removed from the result.
            default_nagative_prompt: Initial value of the negative prompt box.
        """
        self.default_nagative_prompt = default_nagative_prompt
        self.post_filter = post_filter
        self.model = None
        if isinstance(app_config, (str, os.PathLike)):
            # A plain directory was given: use it as the model directory.
            self.model_dir = os.fspath(app_config)
        else:
            self.model_dir = os.path.join(app_config.dpath, 'models/tagger')

    def _label(self, lang_util, key):
        """Return the caption for *key*, from *lang_util* when one is given."""
        if lang_util is None:
            return self._DEFAULT_LABELS[key]
        return lang_util.get_text(key)

    def layout(self, lang_util, input_image):
        """Create the widgets and wire the analyze button.

        Args:
            lang_util: Object providing ``get_text(key)`` for the captions
                ``prompt``, ``negative_prompt`` and ``analyze_prompt``; may be
                ``None`` to use built-in captions.
            input_image: Gradio component whose value is sent to the tagger.

        Returns:
            ``[prompt_textbox, negative_prompt_textbox]``.
        """
        with gr.Column():
            with gr.Row():
                self.prompt = gr.Textbox(label=self._label(lang_util, "prompt"), lines=3)
            with gr.Row():
                self.negative_prompt = gr.Textbox(
                    label=self._label(lang_util, "negative_prompt"),
                    lines=3,
                    value=self.default_nagative_prompt,
                )
            with gr.Row():
                self.prompt_analysis_button = gr.Button(self._label(lang_util, "analyze_prompt"))
        self.prompt_analysis_button.click(
            self.process_prompt_analysis,
            inputs=[input_image],
            outputs=self.prompt
        )
        return [self.prompt, self.negative_prompt]

    def process_prompt_analysis(self, input_image_path):
        """Run the tagger on the input and return the tag string.

        Args:
            input_image_path: Value of the input image component; it is
                forwarded unchanged to ``analysis``.

        Returns:
            The comma separated tags, with colour tags removed when
            ``post_filter`` is set.

        Raises:
            gr.Error: If no image has been provided yet.
        """
        if input_image_path is None:
            raise gr.Error("Please provide an input image before analyzing.")
        if self.model is None:
            # Load the model on first use only.
            self.model = modelLoad(self.model_dir)
        tags = analysis(input_image_path, self.model_dir, self.model)
        if self.post_filter:
            return remove_color(tags)
        return tags

utils/prompt_utils.py ADDED Viewed

	@@ -0,0 +1,28 @@

def remove_duplicates(base_prompt):
    """Drop repeated tags from a ``", "``-separated prompt.

    Tags are compared case-insensitively and with surrounding whitespace
    ignored; the first spelling of each tag is kept and empty tags are
    discarded. The order of the remaining tags is preserved.
    """
    known = set()
    kept = []
    for raw_tag in base_prompt.split(", "):
        key = raw_tag.lower().strip()
        if key == "" or key in known:
            continue
        known.add(key)
        kept.append(raw_tag)
    return ", ".join(kept)
def remove_color(base_prompt):
    """Remove colour-related tags from a ``", "``-separated prompt.

    A tag is dropped when its lower-cased text contains any of the colour
    words below. NOTE: the match is by substring, so a tag such as "bored"
    (which contains "red") is dropped as well.
    """
    color_words = (
        "pink", "red", "orange", "brown", "yellow", "green", "blue",
        "purple", "blonde", "colored skin", "white hair",
    )
    kept = []
    for tag in base_prompt.split(", "):
        lowered = tag.lower()
        if not any(word in lowered for word in color_words):
            kept.append(tag)
    return ", ".join(kept)
def execute_prompt(execute_tags, base_prompt):
    """Return *base_prompt* without the tags contained in *execute_tags*.

    The prompt is split on ``", "``; every tag for which
    ``tag in execute_tags`` holds is dropped and the rest is joined again.
    """
    survivors = []
    for tag in base_prompt.split(", "):
        if tag in execute_tags:
            continue
        survivors.append(tag)
    return ", ".join(survivors)

utils/tagger.py ADDED Viewed

	@@ -0,0 +1,149 @@

+# -*- coding: utf-8 -*-
+# https://github.com/kohya-ss/sd-scripts/blob/main/finetune/tag_images_by_wd14_tagger.py
+import csv
+import os
+os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
+from PIL import Image
+import cv2
+import numpy as np
+from pathlib import Path
+import onnx
+import onnxruntime as ort
# Settings taken over from the WD14 tagger script.
IMAGE_SIZE = 448  # edge length in pixels of the square image fed to the model
model = None  # module-level placeholder for a loaded model
def convert_array_to_bgr(array):
    """Return a NumPy image array in BGR channel order.

    Parameters:
    - array: NumPy array of the image; 2-D arrays are treated as grayscale,
      3-D arrays as RGB, or as RGBA when they have four channels.
    Returns:
    - A NumPy array in BGR order. Grayscale input is repeated over three
      channels; the alpha channel of RGBA input is dropped.
    Raises:
    - ValueError if the array is neither 2-D nor 3-D.
    """
    dims = array.ndim
    if dims not in (2, 3):
        raise ValueError("Unsupported array shape.")
    if dims == 2:
        # Grayscale: the same plane serves as B, G and R.
        return np.stack([array, array, array], axis=-1)
    # Keep only the colour channels of RGBA input, then reverse RGB -> BGR.
    rgb = array[:, :, :3] if array.shape[2] == 4 else array
    return rgb[:, :, ::-1]
def preprocess_image(image):
    """Turn an image into the square float32 BGR array fed to the tagger.

    The image is converted to BGR, padded with the value 255 to a centred
    square, resized to IMAGE_SIZE x IMAGE_SIZE and cast to float32.
    """
    bgr = convert_array_to_bgr(np.array(image))
    height, width = bgr.shape[0:2]
    side = max(height, width)
    extra_w = side - width
    extra_h = side - height
    left = extra_w // 2
    top = extra_h // 2
    padding = ((top, extra_h - top), (left, extra_w - left), (0, 0))
    squared = np.pad(bgr, padding, mode="constant", constant_values=255)
    # INTER_AREA when shrinking, INTER_LANCZOS4 otherwise.
    if side > IMAGE_SIZE:
        interp = cv2.INTER_AREA
    else:
        interp = cv2.INTER_LANCZOS4
    resized = cv2.resize(squared, (IMAGE_SIZE, IMAGE_SIZE), interpolation=interp)
    return resized.astype(np.float32)
def modelLoad(model_dir):
    """Open ``model.onnx`` from *model_dir* with the CPU execution provider.

    Prints the first provider reported by the session and returns
    ``[session, input_name]``, where ``input_name`` is the name of the
    model's first input.
    """
    session = ort.InferenceSession(
        os.path.join(model_dir, "model.onnx"),
        # Restrict inference to the CPU provider.
        providers=['CPUExecutionProvider'],
    )
    # Report which provider the session actually uses.
    print(f"Using provider: {session.get_providers()[0]}")
    return [session, session.get_inputs()[0].name]
def analysis(image_path, model_dir, model):
    """Tag an image with the tagger model and return the tags as one string.

    Args:
        image_path: Path (or file object) of the image, or an already loaded
            PIL image.
        model_dir: Directory containing ``selected_tags.csv``.
        model: ``[ort_session, input_name]`` as returned by ``modelLoad``.

    Returns:
        The accepted tags joined with ``", "``, in model output order.

    Raises:
        ValueError: If no image was given.
        AssertionError: If the CSV header is not ``tag_id,name,category,...``.
    """
    if image_path is None or (isinstance(image_path, (str, bytes)) and not image_path):
        # Previously this case surfaced later as a NameError on `image_pil`.
        raise ValueError("analysis() requires an image path or a PIL image.")

    ort_session, input_name = model[0], model[1]

    with open(os.path.join(model_dir, "selected_tags.csv"), "r", encoding="utf-8") as f:
        table = list(csv.reader(f))
    header, rows = table[0], table[1:]  # header: tag_id,name,category,count
    assert header[0] == "tag_id" and header[1] == "name" and header[2] == "category", f"unexpected csv format: {header}"
    # NOTE(review): as in the script this was taken from, the first data row
    # is skipped here; that only matters if it is a category 0 or 4 row.
    general_tags = [row[1] for row in rows[1:] if row[2] == "0"]
    character_tags = [row[1] for row in rows[1:] if row[2] == "4"]

    undesired_tags = ["transparent background"]
    caption_separator = ", "
    general_threshold = 0.35
    character_threshold = 0.35

    # Load the image as RGBA so that transparency is preserved for now.
    if isinstance(image_path, Image.Image):
        img = image_path.convert("RGBA")
    else:
        with Image.open(image_path) as opened:
            img = opened.convert("RGBA")
    # Paste onto a white canvas (using the image's own alpha as the mask) so
    # that transparent areas become white, then drop the alpha channel.
    canvas_image = Image.new('RGBA', img.size, (255, 255, 255, 255))
    canvas_image.paste(img, (0, 0), img)
    image_pil = canvas_image.convert("RGB")

    image_preprocessed = preprocess_image(image_pil)
    image_preprocessed = np.expand_dims(image_preprocessed, axis=0)

    # Run inference; take the first output of the first (only) batch item.
    prob = ort_session.run(None, {input_name: image_preprocessed})[0][0]

    # Build the tag list. The first four scores are skipped; the following
    # ones are mapped first onto the general tags, then the character tags.
    combined_tags = []
    general_count = len(general_tags)
    for i, p in enumerate(prob[4:]):
        if i < general_count:
            if p < general_threshold:
                continue
            tag_name = general_tags[i]
        else:
            if p < character_threshold:
                continue
            tag_name = character_tags[i - general_count]
        if len(tag_name) > 3:  # ignore emoji tags like >_< and ^_^
            tag_name = tag_name.replace("_", " ")
        if tag_name not in undesired_tags:
            combined_tags.append(tag_name)

    return caption_separator.join(combined_tags)

utils/utils.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+import requests
+from tqdm import tqdm
+import shutil
+from PIL import Image, ImageOps
+import numpy as np
+import cv2
def load_cn_model(model_dir):
    """Download the ControlNet weights into *model_dir* unless already present.

    The file is stored as ``diffusion_pytorch_model.safetensors``. It is first
    written to a ``.part`` file and renamed once the download has finished, so
    an interrupted download is not mistaken for a complete file on the next
    start.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    folder = model_dir
    file_name = 'diffusion_pytorch_model.safetensors'
    url = "https://huggingface.co/kataragi/ControlNet-LineartXL/resolve/main/Katarag_lineartXL-fp16.safetensors"
    file_path = os.path.join(folder, file_name)
    if os.path.exists(file_path):
        return

    if folder:
        # open() below does not create missing directories.
        os.makedirs(folder, exist_ok=True)

    partial_path = file_path + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly instead of saving an error page as the model file.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        with open(partial_path, 'wb') as f, tqdm(
                desc=file_name,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
            for data in response.iter_content(chunk_size=1024 * 1024):
                size = f.write(data)
                bar.update(size)
    os.replace(partial_path, file_path)
def load_cn_config(model_dir):
    """Copy ``config.json`` from the working directory into *model_dir*.

    Nothing is done when *model_dir* already contains a ``config.json``.
    The target directory is created when it does not exist yet.
    """
    folder = model_dir
    file_name = 'config.json'
    file_path = os.path.join(folder, file_name)
    if os.path.exists(file_path):
        return
    if folder:
        # shutil.copy does not create the destination directory.
        os.makedirs(folder, exist_ok=True)
    config_path = os.path.join(os.getcwd(), file_name)
    shutil.copy(config_path, file_path)
def resize_image_aspect_ratio(image):
    """Resize *image* to the preset resolution closest to its aspect ratio.

    Args:
        image: PIL image.

    Returns:
        A new image resized (not cropped) to the ``(width, height)`` preset
        whose aspect ratio is nearest to that of the input.
    """
    # Size and aspect ratio of the original image
    original_width, original_height = image.size
    aspect_ratio = original_width / original_height

    # Preset sizes keyed by aspect ratio (width / height)
    sizes = {
        1: (1024, 1024),  # square
        4/3: (1152, 896),  # landscape
        3/2: (1216, 832),
        16/9: (1344, 768),
        21/9: (1568, 672),
        3/1: (1728, 576),
        1/4: (512, 2048),  # portrait
        1/3: (576, 1728),
        9/16: (768, 1344),
        2/3: (832, 1216),
        3/4: (896, 1152),
    }

    # Pick the preset with the closest aspect ratio
    closest_aspect_ratio = min(sizes, key=lambda ratio: abs(ratio - aspect_ratio))
    target_width, target_height = sizes[closest_aspect_ratio]

    # Image.ANTIALIAS no longer exists in Pillow 10; LANCZOS is the same filter.
    return image.resize((target_width, target_height), Image.LANCZOS)
def base_generation(size, color):
    """Return a new RGBA image of *size* filled with *color*."""
    return Image.new("RGBA", size, color)