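"""Real-time webcam captioning with SmolVLM2 served through Gradio.

Frames streamed from the browser webcam are captioned by a selectable SmolVLM2
image-text-to-text model on CPU, CUDA, or Intel XPU, and per-step timings are
reported in a debug log.
"""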
import spaces
import subprocess
import os

# Install flash attention, skipping the CUDA build when no prebuilt wheel is available.
# Copy the current environment so pip keeps PATH and any other required variables.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
import time
import logging
import gradio as gr
import cv2
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image

# Cache for the currently loaded model and processor so repeated calls reuse them
model_cache = {'model_id': None, 'processor': None, 'model': None, 'device': None}

# Check for XPU availability
has_xpu = hasattr(torch, 'xpu') and torch.xpu.is_available()

def update_model(model_id, device):
    if model_cache['model_id'] != model_id or model_cache['device'] != device:
        logging.info(f'Loading model {model_id} on {device}')
        processor = AutoProcessor.from_pretrained(model_id)
        # Load model with appropriate precision for each device
        if device == 'cuda':
            # Use bfloat16 for CUDA for performance
            model = AutoModelForImageTextToText.from_pretrained(
                model_id,
                torch_dtype=torch.bfloat16,
                _attn_implementation='flash_attention_2'
            ).to('cuda')
        elif device == 'xpu' and has_xpu:
            # Use float32 on XPU to avoid bfloat16 layernorm issues
            model = AutoModelForImageTextToText.from_pretrained(
                model_id,
                torch_dtype=torch.float32
            ).to('xpu')
        else:
            # Default to float32 on CPU
            model = AutoModelForImageTextToText.from_pretrained(model_id).to('cpu')
        model.eval()
        model_cache.update({'model_id': model_id, 'processor': processor, 'model': model, 'device': device})
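
# Example (hypothetical) of the caching behaviour: repeating a call with the same
# arguments is a no-op, while changing the model or device reloads the weights.
#   update_model('HuggingFaceTB/SmolVLM2-256M-Video-Instruct', 'cpu')  # loads weights
#   update_model('HuggingFaceTB/SmolVLM2-256M-Video-Instruct', 'cpu')  # cache hit, no reload
#   update_model('HuggingFaceTB/SmolVLM2-500M-Video-Instruct', 'cpu')  # different model, reloads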

@spaces.GPU
def caption_frame(frame, model_id, interval_ms, sys_prompt, usr_prompt, device):
    debug_msgs = []
    update_model(model_id, device)
    processor = model_cache['processor']
    model = model_cache['model']

    # Control capture interval
    time.sleep(interval_ms / 1000)

    # Preprocess frame
    t0 = time.time()
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(rgb)
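    # Write the frame as a compressed JPEG to a fixed local path; the chat message
    # below references the image by this path.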
    temp_path = 'frame.jpg'
    pil_img.save(temp_path, format='JPEG', quality=50)
    debug_msgs.append(f'Preprocess: {int((time.time()-t0)*1000)} ms')

    # Prepare multimodal chat messages
    messages = [
        {'role': 'system', 'content': [{'type': 'text', 'text': sys_prompt}]},
        {'role': 'user', 'content': [
            {'type': 'image', 'url': temp_path},
            {'type': 'text', 'text': usr_prompt}
        ]}
    ]

    # Tokenize and encode
    t1 = time.time()
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors='pt'
    )
    # Move inputs to correct device and dtype (matching model parameters)
    param_dtype = next(model.parameters()).dtype
    cast_inputs = {}
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor):
            if v.dtype.is_floating_point:
                # cast floating-point tensors to model's parameter dtype
                cast_inputs[k] = v.to(device=model.device, dtype=param_dtype)
            else:
                # move integer/mask tensors without changing dtype
                cast_inputs[k] = v.to(device=model.device)
        else:
            cast_inputs[k] = v
    inputs = cast_inputs
    debug_msgs.append(f'Tokenize: {int((time.time()-t1)*1000)} ms')

    # Inference
    t2 = time.time()
    outputs = model.generate(**inputs, do_sample=False, max_new_tokens=128)
    debug_msgs.append(f'Inference: {int((time.time()-t2)*1000)} ms')

    # Decode and strip history
    t3 = time.time()
    raw = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    debug_msgs.append(f'Decode: {int((time.time()-t3)*1000)} ms')
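    # The decoded string contains the full chat transcript; keep only the assistant's
    # reply, falling back to the last line when no "Assistant:" marker is present.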
    if "Assistant:" in raw:
        caption = raw.split("Assistant:")[-1].strip()
    else:
        lines = raw.splitlines()
        caption = lines[-1].strip() if len(lines) > 1 else raw.strip()

    return caption, '\n'.join(debug_msgs)
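
# Minimal sketch (hypothetical) of calling caption_frame outside Gradio for a quick
# offline test, assuming a local image file such as 'test.jpg':
#   frame = cv2.imread('test.jpg')  # OpenCV yields a BGR ndarray, like the webcam stream
#   caption, log = caption_frame(
#       frame,
#       'HuggingFaceTB/SmolVLM2-256M-Video-Instruct',
#       0,                                  # interval_ms: no extra delay
#       'Describe the key action',
#       'What is happening in this image?',
#       'cpu',
#   )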


def main():
    logging.basicConfig(level=logging.INFO)
    model_choices = [
        'HuggingFaceTB/SmolVLM2-256M-Video-Instruct',
        'HuggingFaceTB/SmolVLM2-500M-Video-Instruct',
        'HuggingFaceTB/SmolVLM2-2.2B-Instruct'
    ]
    # Determine available devices
    device_options = ['cpu']
    if torch.cuda.is_available():
        device_options.append('cuda')
    if has_xpu:
        device_options.append('xpu')

    default_device = 'cuda' if torch.cuda.is_available() else ('xpu' if has_xpu else 'cpu')

    with gr.Blocks() as demo:
        gr.Markdown('## 🎥 Real-Time Webcam Captioning with SmolVLM2 (Transformers)')

        with gr.Row():
            model_dd  = gr.Dropdown(model_choices, value=model_choices[0], label='Model ID')
            device_dd = gr.Dropdown(device_options, value=default_device, label='Device')

        interval   = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
        sys_p      = gr.Textbox(lines=2, value='Describe the key action', label='System Prompt')
        usr_p      = gr.Textbox(lines=1, value='What is happening in this image?', label='User Prompt')

        cam        = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
        caption_tb = gr.Textbox(interactive=False, label='Caption')
        log_tb     = gr.Textbox(lines=4, interactive=False, label='Debug Log')

        cam.stream(
            fn=caption_frame,
            inputs=[cam, model_dd, interval, sys_p, usr_p, device_dd],
            outputs=[caption_tb, log_tb],
            time_limit=600
        )

    # Enable Gradio's async event queue to register callback IDs and prevent KeyErrors
    demo.queue()

    # Launch the app
    demo.launch()


if __name__ == '__main__':
    main()