Florence-2-SD3-Captioner

Running on Zero

File size: 6,700 Bytes

4e70ef0
 
 
 
716f2ea
 
e744f2e
4e70ef0
60fb93f
 
 
9142ec3
 
4e70ef0
 
71141aa
4e70ef0
 
716f2ea
 
 
 
4e70ef0
 
716f2ea
 
 
4e70ef0
716f2ea
 
 
b5c347a
716f2ea
4e70ef0
e744f2e
716f2ea
 
 
 
 
4e70ef0
 
716f2ea
 
 
9af016f
4e70ef0
 
 
 
 
 
 
716f2ea
a32cba6
4e70ef0
716f2ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e70ef0
 
716f2ea
4e70ef0
 
 
 
71141aa
716f2ea
 
4e70ef0
 
 
 
 
 
716f2ea
4e70ef0
 
716f2ea
 
 
ff558e1
716f2ea
 
 
4e70ef0
716f2ea
 
 
 
 
 
 
4e70ef0

import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import re
from PIL import Image 
import os
import numpy as np
import spaces

import subprocess
# Install flash-attn at import time, before the model is loaded below.
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE is passed to the pip subprocess.
# NOTE(review): `env=` replaces the child's entire environment, so PATH, HOME,
# etc. are not inherited; `{**os.environ, ...}` may be what was intended - confirm.
# NOTE(review): the return code is not checked, so a failed install goes unnoticed here.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# Load the captioning model onto the GPU in eval mode, plus its matching processor.
# NOTE(review): TITLE/DESCRIPTION below refer to the fine-tuned
# gokaygokay/Florence-2-SD3-Captioner (Florence-2 base), but the checkpoint
# loaded here is 'microsoft/Florence-2-large' - confirm which one is intended.
model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to("cuda").eval()
processor = AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True)

# Markdown strings rendered at the top of the Gradio page.
TITLE = "# [Florence-2 SD3 Long Captioner](https://huggingface.co/gokaygokay/Florence-2-SD3-Captioner/)"
DESCRIPTION = "[Florence-2 Base](https://huggingface.co/microsoft/Florence-2-base-ft) fine-tuned on Long SD3 Prompt and Image pairs. Check above link for datasets that are used for fine-tuning."

def modify_caption(caption: str) -> str:
    """Strip boilerplate lead-ins from a caption and flatten it to one line.

    Three case-insensitive rewrites are applied in order:

    * ``"The image shows "`` is removed;
    * ``"The image is <anything> of "`` is removed;
    * ``"of the <anything> is"`` is shortened to ``"is"``.

    Afterwards runs of blank lines are collapsed, the text is stripped, and
    the remaining lines are joined with single spaces.
    """
    rewrites = (
        (r'The image shows ', ''),      # drop the "The image shows " lead-in
        (r'The image is .*? of ', ''),  # drop "The image is ... of "
        (r'of the .*? is', 'is'),       # shorten "of the ... is" to "is"
    )

    cleaned = caption
    for regex, substitute in rewrites:
        cleaned = re.sub(regex, substitute, cleaned, flags=re.IGNORECASE)

    # Collapse blank lines first, then merge the remaining lines into one.
    collapsed = re.sub(r'\n\s*\n', '\n', cleaned)
    return ' '.join(collapsed.strip().splitlines())

@spaces.GPU
def process_image(image):
    """Generate a cleaned-up detailed caption for a single image.

    Args:
        image: A ``numpy.ndarray``, a file path string, or a PIL image.

    Returns:
        The caption string produced by the model for the
        ``<MORE_DETAILED_CAPTION>`` task, post-processed by ``modify_caption``.

    Raises:
        gr.Error: If ``image`` is ``None`` (e.g. Submit was clicked before an
            image was provided).
    """
    # Without this guard a missing image fails below with an opaque
    # AttributeError on `image.mode`.
    if image is None:
        raise gr.Error("Please provide an image before submitting.")

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    elif isinstance(image, str):
        # convert() returns an independent in-memory copy, so the file handle
        # can be released as soon as the pixels have been read.
        with Image.open(image) as opened:
            image = opened.convert("RGB")
    if image.mode != "RGB":
        image = image.convert("RGB")

    prompt = "<MORE_DETAILED_CAPTION>"

    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    # Special tokens are kept in the decoded text that is handed to
    # post_process_generation.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=prompt,
        image_size=(image.width, image.height),
    )
    return modify_caption(parsed_answer[prompt])

def extract_frames(image_path, output_folder):
    """Save every frame of an image as a separate PNG file.

    Frames are written to ``output_folder`` as
    ``<original stem>_frame_<NNN>.png`` with a zero-padded, zero-based index.

    Args:
        image_path: Path of the (possibly animated) source image.
        output_folder: Directory the frame files are written into.

    Returns:
        List of the frame file paths that were written, in frame order.
    """
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    frame_paths = []

    with Image.open(image_path) as img:
        # Not every image object exposes `n_frames`; fall back to a single
        # frame instead of raising AttributeError (mirrors the getattr guard
        # used for `is_animated` in process_folder).
        frame_count = getattr(img, "n_frames", 1)

        try:
            for index in range(frame_count):
                img.seek(index)
                frame_path = os.path.join(output_folder, f"{base_name}_frame_{index:03d}.png")
                img.save(frame_path)
                frame_paths.append(frame_path)
        except EOFError:
            # Seeking ran past the last available frame; keep the frames
            # extracted so far (deliberate best-effort).
            pass

    return frame_paths

# File extensions (lower-case) that process_folder treats as images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.heic')


def _is_multi_frame(image_path):
    """Return True when the image at ``image_path`` reports more than one frame."""
    # The file is opened only for this check so the handle is not held while
    # captions are being generated.
    with Image.open(image_path) as img:
        return bool(getattr(img, "is_animated", False)) and img.n_frames > 1


def _caption_to_file(image_path, txt_path):
    """Caption ``image_path`` and write the caption to ``txt_path`` as UTF-8."""
    # Caption first, so no empty .txt is left behind if captioning fails.
    caption = process_image(image_path)
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(caption)


def process_folder(folder_path):
    """Caption every image in a folder, writing one ``.txt`` file per image.

    For each image file a text file with the same stem is written next to it.
    Images that report multiple frames are first split into per-frame PNGs
    (in the same folder) and every frame is captioned separately. Images or
    frames whose ``.txt`` file already exists are skipped.

    Files are visited in sorted order so the report is deterministic, and a
    file that cannot be read or written (``OSError``) is reported as failed
    instead of aborting the rest of the batch.

    Args:
        folder_path: Directory to scan (non-recursive).

    Returns:
        A newline-separated report of processed, skipped and failed files, or
        an explanatory message when the path is invalid or nothing was done.
    """
    if not folder_path or not os.path.isdir(folder_path):
        return "Invalid folder path."

    processed_files = []
    skipped_files = []
    failed_files = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(_IMAGE_EXTENSIONS):
            continue

        image_path = os.path.join(folder_path, filename)
        txt_filename = os.path.splitext(filename)[0] + '.txt'
        txt_path = os.path.join(folder_path, txt_filename)

        # Never overwrite an existing caption.
        if os.path.exists(txt_path):
            skipped_files.append(f"Skipped {filename} (text file already exists)")
            continue

        try:
            if _is_multi_frame(image_path):
                for frame_path in extract_frames(image_path, folder_path):
                    frame_name = os.path.basename(frame_path)
                    frame_txt_filename = os.path.splitext(frame_name)[0] + '.txt'
                    frame_txt_path = os.path.join(folder_path, frame_txt_filename)

                    if os.path.exists(frame_txt_path):
                        skipped_files.append(f"Skipped {frame_name} (text file already exists)")
                        continue

                    _caption_to_file(frame_path, frame_txt_path)
                    processed_files.append(f"Processed {frame_name} -> {frame_txt_filename}")
            else:
                _caption_to_file(image_path, txt_path)
                processed_files.append(f"Processed {filename} -> {txt_filename}")
        except OSError as exc:
            # Covers unreadable or unsupported image files and write failures;
            # PIL's UnidentifiedImageError is an OSError subclass.
            failed_files.append(f"Failed {filename} ({exc})")

    result = "\n".join(processed_files + skipped_files + failed_files)
    return result if result else "No image files found or all files were skipped in the specified folder."

# Custom CSS injected into the Blocks app.
# NOTE(review): the selector targets an element with id "output", but no
# component below sets elem_id="output" - confirm whether this rule is still used.
css = """
#output { height: 500px; overflow: auto; border: 1px solid #ccc; }
"""

# Build the UI: a header plus two tabs (single image and folder batch).
with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)

    # Tab 1: caption one uploaded image.
    with gr.Tab(label="Single Image Processing"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # Example images are referenced by relative file name.
        gr.Examples(
            [["image1.jpg"], ["image2.jpg"], ["image3.png"], ["image4.jpg"], ["image5.jpg"], ["image6.PNG"]],
            inputs=[input_img],
            outputs=[output_text],
            fn=process_image,
            label='Try captioning on below examples'
        )

        submit_btn.click(process_image, [input_img], [output_text])

    # Tab 2: caption every image in a folder; the path is passed to
    # process_folder, which resolves it on the machine running this script.
    with gr.Tab(label="Batch Processing"):
        with gr.Row():
            folder_input = gr.Textbox(label="Input Folder Path")
            batch_submit_btn = gr.Button(value="Process Folder")
        batch_output = gr.Textbox(label="Batch Processing Results", lines=10)

        batch_submit_btn.click(process_folder, [folder_input], [batch_output])

# Start the app as soon as the module is executed (no __main__ guard).
demo.launch(debug=True)