John6666 committed
Commit 1731cc9
1 Parent(s): 240160f

Upload 3 files

Files changed (3)
  1. app.py +63 -6
  2. fl2sd3longcap.py +75 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -410,20 +410,24 @@ from utils import (
  V2_IDENTITY_OPTIONS
  )
  from tagger import (
- predict_tags,
+ predict_tags_wd,
  convert_danbooru_to_e621_prompt,
  remove_specific_prompt,
  insert_recom_prompt,
  compose_prompt_to_copy,
  translate_prompt,
  )
+ from fl2sd3longcap import (
+ predict_tags_fl2_sd3,
+ )
  def description_ui():
  gr.Markdown(
  """
  ## Danbooru Tags Transformer V2 Demo with WD Tagger
  (Image =>) Prompt => Upsampled longer prompt
  - Mod of p1atdev's [Danbooru Tags Transformer V2 Demo](https://huggingface.co/spaces/p1atdev/danbooru-tags-transformer-v2) and [WD Tagger with 🤗 transformers](https://huggingface.co/spaces/p1atdev/wd-tagger-transformers).
- - Models: p1atdev's [wd-swinv2-tagger-v3-hf](https://huggingface.co/p1atdev/wd-swinv2-tagger-v3-hf), [dart-v2-moe-sft](https://huggingface.co/p1atdev/dart-v2-moe-sft)
+ - Models: p1atdev's [wd-swinv2-tagger-v3-hf](https://huggingface.co/p1atdev/wd-swinv2-tagger-v3-hf), [dart-v2-moe-sft](https://huggingface.co/p1atdev/dart-v2-moe-sft)\
+ , gokaygokay's [Florence-2-SD3-Captioner](https://huggingface.co/gokaygokay/Florence-2-SD3-Captioner)
  """
  )
  ## END MOD
@@ -861,6 +865,7 @@ with gr.Blocks(theme="NoCrypt/miku", elem_id="main", css=CSS) as app:
  tag_type_gui = gr.Radio(label="Convert tags to", info="danbooru for Animagine, e621 for Pony.", choices=["danbooru", "e621"], value="danbooru")
  recom_prompt_gui = gr.Radio(label="Insert recommended prompt", choices=["None", "Animagine", "Pony"], value="None", interactive=True)
  keep_tags_gui = gr.Radio(label="Remove tags leaving only the following", choices=["body", "dress", "all"], value="all")
+ image_algorithms = gr.CheckboxGroup(["Use WD Tagger", "Use Florence-2-SD3-Long-Captioner"], label="Algorithms", value=["Use WD Tagger"])
  generate_from_image_btn_gui = gr.Button(value="GENERATE TAGS FROM IMAGE", size="lg", variant="primary")
  with gr.Group():
  prompt_gui = gr.Textbox(lines=6, placeholder="1girl, solo, ...", label="Prompt", show_copy_button=True)
@@ -1275,6 +1280,49 @@ with gr.Blocks(theme="NoCrypt/miku", elem_id="main", css=CSS) as app:
  "Classic",
  "Nearest",
  ],
+ [
+ "1girl, oomuro sakurako, yuru yuri, official art, anime style, school uniform, masterpiece, best quality, very aesthetic, absurdres",
+ "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
+ 1,
+ 40,
+ 7.5,
+ True,
+ -1,
+ None,
+ 1.0,
+ None,
+ 1.0,
+ None,
+ 1.0,
+ None,
+ 1.0,
+ None,
+ 1.0,
+ "Euler",
+ 1024,
+ 1024,
+ "Raelina/Rae-Diffusion-XL-V2",
+ "vaes/sdxl.vae.safetensors", # vae
+ "txt2img",
+ None, # img control
+ "Canny", # preprocessor
+ 512, # preproc resolution
+ 1024, # img resolution
+ None, # Style prompt
+ None, # Style json
+ None, # img Mask
+ 0.35, # strength
+ 100, # low th canny
+ 200, # high th canny
+ 0.1, # value mstd
+ 0.1, # distance mstd
+ 1.0, # cn scale
+ 0., # cn start
+ 1., # cn end
+ False, # ti
+ "Classic",
+ "Nearest",
+ ],
  [
  "yoshida yuuko, machikado mazoku, 1girl, solo, demon horns,horns, school uniform, long hair, open mouth, skirt, demon girl, ahoge, shiny, shiny hair, anime artwork",
  "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
@@ -1453,14 +1501,18 @@ with gr.Blocks(theme="NoCrypt/miku", elem_id="main", css=CSS) as app:
  optimization_gui.change(set_optimization, [optimization_gui, steps_gui, cfg_gui, sampler_gui, clip_skip_gui, lora1_gui, lora_scale_1_gui], [steps_gui, cfg_gui, sampler_gui, clip_skip_gui, lora1_gui, lora_scale_1_gui])

  generate_from_image_btn_gui.click(
- predict_tags,
- inputs=[input_image_gui, general_threshold_gui, character_threshold_gui],
+ predict_tags_wd,
+ inputs=[input_image_gui, prompt_gui, image_algorithms, general_threshold_gui, character_threshold_gui],
  outputs=[
  series_dbt,
  character_dbt,
  prompt_gui,
  copy_button_dbt,
  ],
+ ).then(
+ predict_tags_fl2_sd3,
+ inputs=[input_image_gui, prompt_gui, image_algorithms],
+ outputs=[prompt_gui],
  ).then(
  compose_prompt_to_copy, inputs=[character_dbt, series_dbt, prompt_gui], outputs=[prompt_gui]
  ).then(
@@ -1639,6 +1691,7 @@ with gr.Blocks(theme="NoCrypt/miku", elem_id="main", css=CSS) as app:
  character_threshold = gr.Slider(label="Character threshold", minimum=0.0, maximum=1.0, value=0.8, step=0.01, interactive=True)
  input_tag_type = gr.Radio(label="Convert tags to", info="danbooru for Animagine, e621 for Pony.", choices=["danbooru", "e621"], value="danbooru")
  recom_prompt = gr.Radio(label="Insert recommended prompt", choices=["None", "Animagine", "Pony"], value="None", interactive=True)
+ image_algorithms = gr.CheckboxGroup(["Use WD Tagger", "Use Florence-2-SD3-Long-Captioner"], label="Algorithms", value=["Use WD Tagger"])
  keep_tags = gr.Radio(label="Remove tags leaving only the following", choices=["body", "dress", "all"], value="all")
  generate_from_image_btn = gr.Button(value="GENERATE TAGS FROM IMAGE", size="lg", variant="primary")

@@ -1691,14 +1744,18 @@ with gr.Blocks(theme="NoCrypt/miku", elem_id="main", css=CSS) as app:
  translate_input_prompt_button.click(translate_prompt, inputs=[input_copyright], outputs=[input_copyright])

  generate_from_image_btn.click(
- predict_tags,
- inputs=[input_image, general_threshold, character_threshold],
+ predict_tags_wd,
+ inputs=[input_image, input_general, image_algorithms, general_threshold, character_threshold],
  outputs=[
  input_copyright,
  input_character,
  input_general,
  copy_input_btn,
  ],
+ ).then(
+ predict_tags_fl2_sd3,
+ inputs=[input_image, input_general, image_algorithms],
+ outputs=[input_general],
  ).then(
  remove_specific_prompt, inputs=[input_general, keep_tags], outputs=[input_general],
  ).then(
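The wiring above runs the two taggers as a chained Gradio event: the click handler calls `predict_tags_wd` first, then `.then()` feeds the updated prompt and the `image_algorithms` selection into `predict_tags_fl2_sd3`, which only appends a caption when the Florence-2 option is checked. Below is a minimal, self-contained sketch of that pattern; the stub functions and widget defaults are placeholders for illustration, not the Space's actual implementations.

```python
# Minimal sketch of the two-stage tagging chain (placeholder stubs, not the
# Space's real predict_tags_wd / predict_tags_fl2_sd3 implementations).
import gradio as gr

def predict_tags_wd(image, prompt, algos, general_th, character_th):
    # Placeholder: the real function runs the WD SwinV2 tagger when enabled.
    if image is None or "Use WD Tagger" not in algos:
        return prompt
    return ", ".join(filter(None, [prompt, "wd_tags_here"]))

def predict_tags_fl2_sd3(image, prompt, algos):
    # Placeholder: the real function appends a Florence-2-SD3 caption when enabled.
    if image is None or "Use Florence-2-SD3-Long-Captioner" not in algos:
        return prompt
    return ", ".join(filter(None, [prompt, "florence2_caption_here"]))

with gr.Blocks() as demo:
    input_image = gr.Image(type="pil", label="Input image")
    image_algorithms = gr.CheckboxGroup(
        ["Use WD Tagger", "Use Florence-2-SD3-Long-Captioner"],
        label="Algorithms", value=["Use WD Tagger"],
    )
    general_threshold = gr.Slider(0.0, 1.0, value=0.3, label="General threshold")
    character_threshold = gr.Slider(0.0, 1.0, value=0.8, label="Character threshold")
    prompt = gr.Textbox(label="Prompt")
    btn = gr.Button("GENERATE TAGS FROM IMAGE")
    # First handler writes WD tags into the prompt box; the chained handler then
    # appends the Florence-2 caption to whatever the first step produced.
    btn.click(
        predict_tags_wd,
        inputs=[input_image, prompt, image_algorithms, general_threshold, character_threshold],
        outputs=[prompt],
    ).then(
        predict_tags_fl2_sd3,
        inputs=[input_image, prompt, image_algorithms],
        outputs=[prompt],
    )

# demo.launch()  # uncomment to serve the sketch locally
```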
fl2sd3longcap.py ADDED
@@ -0,0 +1,75 @@
+ from transformers import AutoProcessor, AutoModelForCausalLM
+ import spaces
+ import re
+ from PIL import Image
+ import torch
+
+ import subprocess
+ # Install FlashAttention at runtime (common ZeroGPU Spaces pattern); skip the CUDA build.
+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+
+ fl_model = AutoModelForCausalLM.from_pretrained('gokaygokay/Florence-2-SD3-Captioner', torch_dtype=torch.float16, attn_implementation="flash_attention_2", trust_remote_code=True).to("cuda").eval()
+ fl_processor = AutoProcessor.from_pretrained('gokaygokay/Florence-2-SD3-Captioner', torch_dtype=torch.float16, attn_implementation="flash_attention_2", trust_remote_code=True)
+
+
+ def fl_modify_caption(caption: str) -> str:
+     """
+     Removes specific prefixes from captions if present, otherwise returns the original caption.
+     Args:
+         caption (str): A string containing a caption.
+     Returns:
+         str: The caption with the prefix removed if it was present, or the original caption.
+     """
+     # Define the prefixes to remove
+     prefix_substrings = [
+         ('captured from ', ''),
+         ('captured at ', '')
+     ]
+
+     # Create a regex pattern to match any of the prefixes
+     pattern = '|'.join([re.escape(opening) for opening, _ in prefix_substrings])
+     replacers = {opening.lower(): replacer for opening, replacer in prefix_substrings}
+
+     # Function to replace matched prefix with its corresponding replacement
+     def replace_fn(match):
+         return replacers[match.group(0).lower()]
+
+     # Apply the regex to the caption
+     modified_caption = re.sub(pattern, replace_fn, caption, count=1, flags=re.IGNORECASE)
+
+     # If the caption was modified, return the modified version; otherwise, return the original
+     return modified_caption if modified_caption != caption else caption
+
+ @spaces.GPU
+ def fl_run_example(image):
+     task_prompt = "<DESCRIPTION>"
+     prompt = task_prompt + "Describe this image in great detail."
+
+     # Ensure the image is in RGB mode
+     if image.mode != "RGB":
+         image = image.convert("RGB")
+
+     inputs = fl_processor(text=prompt, images=image, return_tensors="pt").to("cuda")
+     generated_ids = fl_model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = fl_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+     parsed_answer = fl_processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
+     return fl_modify_caption(parsed_answer["<DESCRIPTION>"])
+
+
+ def predict_tags_fl2_sd3(image: Image.Image, input_tags: str, algo: list[str]):
+     # Split a comma-separated tag string into a list, dropping empty entries
+     def to_list(s):
+         return [x.strip() for x in s.split(",") if x.strip() != ""]
+
+     # De-duplicate while preserving the original order
+     def list_uniq(l):
+         return sorted(set(l), key=l.index)
+
+     if "Use Florence-2-SD3-Long-Captioner" not in algo:
+         return input_tags
+     tag_list = list_uniq(to_list(input_tags) + to_list(fl_run_example(image)))
+     return ", ".join(tag_list)
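For reference, `predict_tags_fl2_sd3` can also be exercised outside the Gradio app. The snippet below is a hypothetical standalone call, assuming a CUDA GPU and the ZeroGPU `spaces` package (the module loads the captioner onto `cuda` at import time); `example.png` and the tag string are placeholder inputs.

```python
# Hypothetical standalone use of fl2sd3longcap.predict_tags_fl2_sd3.
# Assumes a CUDA GPU and the `spaces` package; "example.png" is a placeholder path.
from PIL import Image

from fl2sd3longcap import predict_tags_fl2_sd3

image = Image.open("example.png")              # placeholder input image
existing_tags = "1girl, solo, school uniform"  # placeholder tag string

# With the Florence-2 option enabled, the caption phrases are de-duplicated
# and appended to the existing tag string.
merged = predict_tags_fl2_sd3(image, existing_tags, ["Use Florence-2-SD3-Long-Captioner"])
print(merged)

# With the option disabled, the input tags are returned unchanged.
print(predict_tags_fl2_sd3(image, existing_tags, ["Use WD Tagger"]))
```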
requirements.txt CHANGED
@@ -12,4 +12,5 @@ huggingface_hub
  diffusers
  httpx==0.13.3
  httpcore
- googletrans==4.0.0rc1
+ googletrans==4.0.0rc1
+ timm