File size: 3,216 Bytes
f7ce2e3
840bcef
fcc3aa5
 
 
840bcef
fcc3aa5
 
 
f7ce2e3
 
fcc3aa5
 
 
 
 
f7ce2e3
fcc3aa5
 
c4f6597
fcc3aa5
 
f7ce2e3
fcc3aa5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7ce2e3
 
fcc3aa5
 
 
 
840bcef
fcc3aa5
840bcef
 
fcc3aa5
 
 
 
 
 
 
 
840bcef
 
 
fcc3aa5
 
 
 
 
 
840bcef
fcc3aa5
 
 
840bcef
fcc3aa5
 
 
 
 
 
 
 
 
 
 
 
 
 
f7ce2e3
fcc3aa5
 
 
 
 
 
 
f7ce2e3
fcc3aa5
 
 
 
 
 
 
 
 
 
 
 
f7ce2e3
fcc3aa5
840bcef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
from datetime import datetime
import numpy as np
import os


# Markdown text for the page header; passed to gr.Markdown when the UI is built.
DESCRIPTION = """
# Migician Interface
This is a demo Space for paper Migician: Revealing the Magic of Free-Form Multi-Image Grounding in Multimodal Large Language Models.[ACL 2025]
Come and feel the magic of multi-image grounding!
"""

# Model identifier handed to both from_pretrained calls below.
model_id = "Michael4933/Migician"
# Load the model once at import time. Both the dtype and the device placement
# are left to the library ("auto").
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
# Processor loaded from the same identifier; run_example uses it for chat
# templating, building the model inputs, and decoding the generated ids.
processor = AutoProcessor.from_pretrained(model_id)

def array_to_image_path(image_array):
    """Write an image array to a uniquely named PNG and return its path.

    The file is created in the system temporary directory. Its name keeps the
    ``image_<timestamp>`` prefix but gets a random component appended, so two
    calls within the same second can no longer overwrite each other's file.

    Args:
        image_array: Pixel data accepted by ``PIL.Image.fromarray`` after a
            cast to ``uint8``, or ``None`` when nothing was uploaded.

    Returns:
        Absolute path of the PNG file that was written. The caller owns the
        file and is responsible for deleting it.

    Raises:
        ValueError: If ``image_array`` is ``None``.
    """
    # Local import: this is the only place in the file that needs tempfile.
    import tempfile

    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")

    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))

    # The timestamp only has one-second resolution; NamedTemporaryFile adds a
    # random suffix and creates the file atomically, which makes it unique.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    with tempfile.NamedTemporaryFile(
        prefix=f"image_{timestamp}_", suffix=".png", delete=False
    ) as tmp:
        # Saving to a file object requires the format to be given explicitly.
        img.save(tmp, format="PNG")

    return os.path.abspath(tmp.name)


@spaces.GPU
def run_example(image, text_input=None):
    """Run the model on one image plus a text prompt and return its answer.

    The image is written to a temporary PNG (via ``array_to_image_path``) so
    it can be referenced by path in the chat message; the file is removed
    again once inference has finished or failed.

    Args:
        image: Image data as delivered by the ``gr.Image`` input component,
            or ``None`` if the user did not upload anything.
        text_input: The question or instruction for the model. ``None`` is
            treated as an empty prompt.

    Returns:
        The decoded model response as a string, with the prompt tokens
        stripped.

    Raises:
        ValueError: If ``image`` is ``None`` (raised by
            ``array_to_image_path``).
    """
    image_path = array_to_image_path(image)
    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image_path,
                    },
                    {
                        "type": "text",
                        # Never hand None to the chat template.
                        "text": text_input or "",
                    },
                ],
            }
        ]

        # Preparation for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # The model is loaded with device_map="auto", so follow whatever
        # device it ended up on instead of assuming "cuda".
        inputs = inputs.to(model.device)

        # Inference: Generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=1024)
        # generate() returns prompt + completion; drop the prompt part.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
    finally:
        # Best-effort cleanup so the PNG written for this request does not
        # accumulate on disk; a failed delete must not mask the real result
        # or the real error.
        try:
            os.remove(image_path)
        except OSError:
            pass

    return output_text[0]

# Custom CSS handed to gr.Blocks: gives the element with id "output" a fixed
# 500px height, scrolling on overflow, and a thin grey border.
# NOTE(review): no component created below sets elem_id="output", so this rule
# may currently match nothing — confirm whether the output textbox should use it.
css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

# Build the UI: a header, then a single tab with two columns
# (inputs + submit button on the left, the model's answer on the right).
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Feel the Magic of Multi-Image Grounding"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # Clicking Submit calls run_example(image, question) and writes the
        # returned string into the output textbox.
        submit_btn.click(run_example, [input_img, text_input], [output_text])

# Enable Gradio's request queue (with api_open=False), then start the server
# with debug=True.
demo.queue(api_open=False)
demo.launch(debug=True)