File size: 9,657 Bytes
175c9ee
 
 
 
bc0f349
 
175c9ee
 
 
 
 
 
bc0f349
 
 
042c96d
3586102
 
 
 
 
 
 
 
 
 
 
 
 
 
bc0f349
175c9ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67f0068
175c9ee
 
 
 
 
 
 
 
 
 
 
 
407114c
 
 
 
175c9ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc0f349
175c9ee
 
 
 
 
 
 
 
 
 
 
23b0e0e
 
3586102
 
 
 
 
175c9ee
 
 
 
 
 
 
 
 
 
bc0f349
175c9ee
67f0068
 
 
 
 
175c9ee
 
 
407114c
175c9ee
 
 
 
 
 
 
 
 
 
 
 
 
67f0068
175c9ee
 
bc0f349
175c9ee
 
 
 
bc0f349
 
67f0068
 
 
23b0e0e
 
67f0068
 
 
 
175c9ee
8ecd8c8
67f0068
 
87fccd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67f0068
 
23b0e0e
 
67f0068
 
 
175c9ee
3586102
 
042c96d
e99777e
407114c
3586102
 
67f0068
175c9ee
 
67f0068
175c9ee
 
 
67f0068
 
 
3586102
23b0e0e
175c9ee
67f0068
175c9ee
ce41e3e
407114c
175c9ee
 
 
 
 
67f0068
 
175c9ee
 
 
67f0068
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import os
import pathlib
import tempfile
from collections.abc import Iterator
from threading import Thread

import av
import gradio as gr
import torch
from gradio.utils import get_upload_folder
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.generation.streamers import TextIteratorStreamer
from optimum.intel import OVModelForVisualCausalLM


# Checkpoint loaded eagerly at import time, before any request is served.
default_model_id = "echarlaix/SmolVLM2-500M-Video-Instruct-openvino"

# Single-slot, module-level cache holding the currently loaded checkpoint:
#   "model_id"  -> id of the checkpoint the other two entries were loaded from
#   "processor" -> AutoProcessor for that checkpoint
#   "model"     -> OVModelForVisualCausalLM for that checkpoint
# update_model() replaces all three entries when a different id is requested.
model_cache = {
    "model_id" : default_model_id,
    "processor" : AutoProcessor.from_pretrained(default_model_id),
    "model" : OVModelForVisualCausalLM.from_pretrained(default_model_id),
}

def update_model(model_id: str) -> None:
    """Make ``model_cache`` hold the processor and model for *model_id*.

    Nothing is loaded when *model_id* is already the cached checkpoint.
    Otherwise the processor and model are both loaded first and only then
    written to the cache together with the new id. If either load raises,
    the cache keeps its previous, self-consistent content instead of
    advertising the new id while still holding the old objects.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            processor and the OpenVINO model.
    """
    if model_cache["model_id"] == model_id:
        return

    # Load everything before touching the cache so a failure cannot leave it
    # half-updated.
    processor = AutoProcessor.from_pretrained(model_id)
    model = OVModelForVisualCausalLM.from_pretrained(model_id)

    model_cache.update(model_id=model_id, processor=processor, model=model)



# File extensions accepted for uploads; kept as tuples so they can be passed
# straight to str.endswith().
IMAGE_FILE_TYPES = (".jpg", ".jpeg", ".png", ".webp")
VIDEO_FILE_TYPES = (".mp4", ".mov", ".webm")

# Folder returned by Gradio's get_upload_folder(); used as the parent directory
# for the per-video frame directories created by extract_frames_to_tempdir().
GRADIO_TEMP_DIR = get_upload_folder()

# Limits, each overridable through an environment variable of the same name.
TARGET_FPS = int(os.getenv("TARGET_FPS", "3"))  # frames sampled per second of video
MAX_FRAMES = int(os.getenv("MAX_FRAMES", "30"))  # cap on frames extracted from one video
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "10_000"))  # prompts longer than this are rejected


def get_file_type(path: str) -> str:
    """Classify *path* as ``"image"`` or ``"video"`` from its file extension.

    The extension is compared case-insensitively, so ``photo.JPG`` or
    ``clip.MOV`` are accepted just like their lower-case spellings.

    Args:
        path: File path or name to classify.

    Returns:
        ``"image"`` if the path ends with one of ``IMAGE_FILE_TYPES``,
        ``"video"`` if it ends with one of ``VIDEO_FILE_TYPES``.

    Raises:
        ValueError: If the extension is in neither tuple.
    """
    # The extension tuples are lower-case, so normalise the path once.
    lowered = path.lower()
    if lowered.endswith(IMAGE_FILE_TYPES):
        return "image"
    if lowered.endswith(VIDEO_FILE_TYPES):
        return "video"
    error_message = f"Unsupported file type: {path}"
    raise ValueError(error_message)


def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
    """Count how many of *paths* are videos and how many are not.

    A path counts as a video when it ends with one of ``VIDEO_FILE_TYPES``;
    the comparison is case-insensitive so ``clip.MP4`` is counted as a video.
    Every other path, whatever its extension, counts as a non-video.

    Args:
        paths: Paths of the files attached to the new message.

    Returns:
        ``(video_count, non_video_count)``.
    """
    video_count = sum(path.lower().endswith(VIDEO_FILE_TYPES) for path in paths)
    return video_count, len(paths) - video_count






def validate_media_constraints(message: dict) -> bool:
    """Check that the attachments of *message* form a supported combination.

    At most one video may be attached, and a video may not be combined with
    any other file. When a rule is broken a Gradio warning is raised for the
    user and ``False`` is returned; otherwise ``True`` is returned.
    """
    n_videos, n_others = count_files_in_new_message(message["files"])

    problem = None
    if n_videos > 1:
        problem = "Only one video is supported."
    elif n_videos == 1 and n_others > 0:
        problem = "Mixing images and videos is not allowed."

    if problem is None:
        return True

    gr.Warning(problem)
    return False


def extract_frames_to_tempdir(
    video_path: str,
    target_fps: float,
    max_frames: int | None = None,
    parent_dir: str | None = None,
    prefix: str = "frames_",
) -> str:
    """Sample frames from a video at a fixed rate and save them as JPEG files.

    Target timestamps are laid out every ``1 / target_fps`` seconds starting
    at 0. For each target, the first decoded frame whose timestamp lies within
    half an interval of it is written to ``frame_<target index>.jpg`` in a
    freshly created temporary directory. A target that no frame falls close
    enough to is skipped, so file numbering may contain gaps.

    Args:
        video_path: Path of the video to read.
        target_fps: Sampling rate in frames per second; must be positive.
        max_frames: Upper bound on the number of target timestamps, or
            ``None`` for no bound.
        parent_dir: Directory in which the temporary directory is created
            (``None`` uses the ``tempfile`` default).
        prefix: Name prefix of the temporary directory.

    Returns:
        Path of the temporary directory holding the extracted frames.

    Raises:
        ValueError: If ``target_fps`` is not positive, or the first video
            stream reports no duration or time base.
    """
    if target_fps <= 0:
        raise ValueError("target_fps must be positive")

    # The context manager closes the container on every exit path, including
    # the validation error below and any failure while decoding.
    with av.open(video_path) as container:
        video_stream = container.streams.video[0]

        if video_stream.duration is None or video_stream.time_base is None:
            raise ValueError("video_stream is missing duration or time_base")

        time_base = video_stream.time_base
        duration = float(video_stream.duration * time_base)
        interval = 1.0 / target_fps
        tolerance = interval / 2

        total_frames = int(duration * target_fps)
        if max_frames is not None:
            total_frames = min(total_frames, max_frames)

        target_times = [i * interval for i in range(total_frames)]

        # Created only after validation so a rejected video leaves no empty
        # directory behind.
        temp_dir = tempfile.mkdtemp(prefix=prefix, dir=parent_dir)
        output_dir = pathlib.Path(temp_dir)
        target_index = 0

        for frame in container.decode(video=0):
            # Every target has been handled: no need to decode the rest.
            if target_index >= len(target_times):
                break

            if frame.pts is None:
                continue

            timestamp = float(frame.pts * time_base)

            # Drop targets whose window already lies behind this frame. Without
            # this, one missed target (low-fps source, timestamp gap or offset)
            # would block every later target from ever matching.
            while target_index < len(target_times) and timestamp - target_times[target_index] >= tolerance:
                target_index += 1
            if target_index >= len(target_times):
                break

            if abs(timestamp - target_times[target_index]) < tolerance:
                frame_path = output_dir / f"frame_{target_index:04d}.jpg"
                frame.to_image().save(frame_path)
                target_index += 1

    return temp_dir


def process_new_user_message(message: dict) -> list[dict]:
    """Turn the new user message into a list of chat-template content parts.

    The text always comes first. A single attached video is replaced by the
    frames extracted from it (one ``image`` part per frame, in file-name
    order); any other attachments become one part each, typed by
    ``get_file_type``.
    """
    attachments = message["files"]
    if not attachments:
        return [{"type": "text", "text": message["text"]}]

    kinds = [get_file_type(attachment) for attachment in attachments]

    if kinds == ["video"]:
        gr.Info(f"Video will be processed at {TARGET_FPS} FPS, max {MAX_FRAMES} frames in this Space.")

        frames_dir = extract_frames_to_tempdir(
            attachments[0],
            target_fps=TARGET_FPS,
            max_frames=MAX_FRAMES,
            parent_dir=GRADIO_TEMP_DIR,
        )
        frame_files = sorted(pathlib.Path(frames_dir).glob("*.jpg"))

        content = [{"type": "text", "text": message["text"]}]
        content.extend({"type": "image", "image": frame_file.as_posix()} for frame_file in frame_files)
        return content

    content = [{"type": "text", "text": message["text"]}]
    content.extend({"type": kind, kind: attachment} for attachment, kind in zip(attachments, kinds, strict=True))
    return content


def process_history(history: list[dict]) -> list[dict]:
    """Convert the chat history into chat-template messages.

    Consecutive non-assistant entries are merged into one ``user`` message
    that is emitted just before the next assistant entry. String content
    becomes a ``text`` part; any other content is treated as a sequence whose
    first element is a file path, typed by ``get_file_type``. User entries
    that are not followed by an assistant entry are not emitted.
    """
    conversation: list[dict] = []
    pending_parts: list[dict] = []

    for turn in history:
        if turn["role"] != "assistant":
            payload = turn["content"]
            if isinstance(payload, str):
                part = {"type": "text", "text": payload}
            else:
                media_path = payload[0]
                kind = get_file_type(media_path)
                part = {"type": kind, kind: media_path}
            pending_parts.append(part)
            continue

        # Flush the user parts gathered since the previous assistant turn.
        if pending_parts:
            conversation.append({"role": "user", "content": pending_parts})
            pending_parts = []
        reply = [{"type": "text", "text": turn["content"]}]
        conversation.append({"role": "assistant", "content": reply})

    return conversation


@torch.inference_mode()
def generate(message: dict, history: list[dict], model_id: str, max_new_tokens: int = 512) -> Iterator[str]:
    """Stream the assistant's reply to a new multimodal chat message.

    Args:
        message: New user message; its ``"text"`` and ``"files"`` entries are
            read by the helpers called below.
        history: Earlier chat turns, converted with ``process_history``.
        model_id: Checkpoint to answer with; ``update_model`` loads it into
            ``model_cache`` when it differs from the cached one.
        max_new_tokens: Forwarded to ``model.generate``.

    Yields:
        The reply accumulated so far, once per streamed chunk. A single empty
        string is yielded instead when the request is rejected, either because
        the media constraints are violated or because the prompt exceeds
        ``MAX_INPUT_TOKENS`` tokens.
    """
    # Fixed system prompt; it is not user-configurable in this function.
    system_prompt = "You are a helpful assistant."

    # Make sure the cache holds the requested checkpoint, then read it back.
    update_model(model_id)
    processor = model_cache["processor"]
    model = model_cache["model"]

    # validate_media_constraints() raises the user-facing warning itself.
    if not validate_media_constraints(message):
        yield ""
        return

    # Conversation order: system prompt, prior turns, then the new user turn.
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
    messages.extend(process_history(history))
    messages.append({"role": "user", "content": process_new_user_message(message)})

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Reject over-long prompts before any generation work is started.
    n_tokens = inputs["input_ids"].shape[1]
    if n_tokens > MAX_INPUT_TOKENS:
        gr.Warning(
            f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {n_tokens} tokens. This limit is set to avoid out-of-memory errors in this Space."
        )
        yield ""
        return

    # NOTE(review): device/dtype transfer is disabled below — presumably not
    # needed for the OpenVINO model; confirm before removing the line.
    # inputs = inputs.to(device=model.device, dtype=torch.bfloat16)

    # NOTE(review): timeout=30.0 is handed to the streamer — presumably the
    # longest wait for the next chunk; confirm against the Transformers docs.
    streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
    # Merge the tokenized inputs with the generation options into one kwargs dict.
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        disable_compile=True,
    )
    # Generation runs in a background thread so this generator can consume the
    # streamer while it is being filled.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Yield the whole reply so far each time, not just the newest chunk.
    output = ""
    for delta in streamer:
        output += delta
        yield output


# Example prompts offered in the chat UI. Each entry is a one-element list
# holding a message dict with the prompt "text" and the attached "files"
# (paths relative to the working directory; the last example has no file).
examples = [
    [
        {
            "text": "What is on the flower?",
            "files": ["assets/bee.jpg"],
        }
    ],
    [
        {
            "text": "Describe this image in detail.",
            "files": ["assets/dogs.jpg"],
        }
    ],
    [
        {
            "text": "Give me a short and easy recipe for this dish",
            "files": ["assets/recipe_burger.webp"],
        }
    ],

    [
        {
            "text": "I want to go somewhere similar to the one in the photo. Give me destinations and travel tips",
            "files": ["assets/travel_tips.jpg"],
        }
    ],
    [
        {
            "text": "As an art critic AI assistant, could you describe this painting in details and make a thorough critic?",
            "files": ["assets/art_critic.png"],
        }
    ],
    [
        {
            "text": "What is the capital of France?",
            "files": [],
        }
    ],
]


# Checkpoints selectable in the "Model ID" dropdown. The first entry is the
# dropdown's initial value and is the same id as default_model_id.
model_choices = [
    "echarlaix/SmolVLM2-500M-Video-Instruct-openvino",
    "echarlaix/SmolVLM2-500M-Video-Instruct-openvino-8bit-static",
    "echarlaix/SmolVLM2-500M-Video-Instruct-openvino-8bit-woq",
]

# Chat UI definition: a multimodal chat whose replies are streamed by generate().
demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    textbox=gr.MultimodalTextbox(
        # Only the image and video extensions handled by this app can be attached.
        file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES),
        file_count="multiple",
        autofocus=True,
    ),
    multimodal=True,
    # Values of these components are passed to generate() after (message, history),
    # i.e. as model_id and max_new_tokens, in this order.
    additional_inputs=[
        gr.Dropdown(model_choices, value=model_choices[0], label="Model ID"),
        # gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
        gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
    ],
    stop_btn=False,
    title="Fast quantized SmolVLM2 ⚡",
    description="Play with a [SmolVLM2-500M-Video-Instruct](https://huggingface.co/echarlaix/SmolVLM2-500M-Video-Instruct-openvino) and its quantized variants : [SmolVLM2-500M-Video-Instruct-openvino-8bit-woq](https://huggingface.co/echarlaix/SmolVLM2-500M-Video-Instruct-openvino-8bit-woq) and [SmolVLM2-500M-Video-Instruct-openvino-8bit-static](https://huggingface.co/echarlaix/SmolVLM2-500M-Video-Instruct-openvino-8bit-static) both obtained by respectively applying Weight-Only Quantization and Static Quantization  using [Optimum Intel](https://github.com/huggingface/optimum-intel) NNCF integration. To get started, upload an image and text or try one of the examples. This demo runs on 4th Generation Intel Xeon (Sapphire Rapids) processors.",
    examples=examples,
    run_examples_on_click=False,
    cache_examples=False,
    css_paths="style.css",
    # NOTE(review): presumably (cleanup frequency, max file age) in seconds per
    # the Gradio docs — confirm.
    delete_cache=(1800, 1800),
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()