ShowUI-Narrator is a lightweight (2B) framework for narrating the user's actions in GUI videos and screenshots, built upon YOLO-v8, Qwen2-VL, and ShowUI.

Quick Start: Install dependencies

pip install -r requirements.txt
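The snippets below also assume the following libraries are available (package names inferred from the imports used in this card; versions are not pinned here):

pip install torch transformers qwen-vl-utils ultralytics opencv-python pillow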

Overview of the Action-Narration Pipeline.

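At a high level, the pipeline runs in three stages: a YOLO-v8 cursor detector crops each frame around the cursor, a first Qwen2-VL pass temporally grounds the action (its start and end keyframes), and a second pass narrates the grounded clip. A minimal sketch of that flow, using the classes and prompts defined below (ground_action and narrate_action are hypothetical wrappers around the two inference rounds shown later; paths are placeholders):

# 1. Cursor-centred cropping (Screenshots_processor / Videoscreen_processor, defined below)
detector = Screenshots_processor('./storage/folder_to_screenshots', 512, 128, check_point_path)
cropped_imgs, original_imgs = detector.create_crop()

# 2. Round 1: temporal grounding over the cropped frames (hypothetical wrapper, see "Inference Example")
# keyframes = ground_action(model, processor, cropped_imgs)

# 3. Round 2: dense narration of the selected keyframes, returned as JSON (hypothetical wrapper)
# narration = narrate_action(model, processor, keyframes)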

Download Vision Language Model

import torch
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import os

model = Qwen2VLForConditionalGeneration.from_pretrained(
    'FRank62Wu/ShowUI-Narrator', torch_dtype="auto", device_map="cuda"
)


processor = AutoProcessor.from_pretrained('FRank62Wu/ShowUI-Narrator')   
processor.tokenizer.pad_token = processor.tokenizer.eos_token

Cursor Detector Example

  1. Load the detector model and define the class for image cropping
import os
import base64
from PIL import Image
from io import BytesIO
import copy

import cv2
from ultralytics import YOLO

def image_to_base64(img_path):
    with open(img_path, "rb") as img_file:
        encoded_img = base64.b64encode(img_file.read()).decode("utf-8")
    return encoded_img
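# Example usage of the helper above (hypothetical path); handy when frames are shipped as base64 strings:
# encoded_frame = image_to_base64('./storage/folder_to_screenshots/frame_0.jpg')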

check_point_path = './ShowUI_Action_Narrator_cursor_detect/best.pt'

class Screenshots_processor:
    def __init__(self, img_path, max_size, delta, check_point_path):
        self.img_path = img_path
        self.cursor_model = YOLO(check_point_path)
        self.scs = []
        self.crop_scs =[]
        self.max_size = max_size
        self.delta = delta
        
    def create_crop(self):
        # Collect screenshot paths, skipping annotation files and previously written crops.
        for each in sorted(os.listdir(self.img_path)):
            if each.endswith('.jsonl') or '_crop' in each:
                continue
            self.scs.append(os.path.join(self.img_path, each))

        # Run the cursor detector on every frame and record the detected cursor centres.
        frame_x, frame_y = [], []
        for image_path in self.scs:
            results = self.cursor_model(image_path)
            img = Image.open(image_path)
            width, height = img.size
            img.close()

            for result in results:
                if result.boxes.xywh.size(0) > 0:
                    xywh_tensor = result.boxes.xywh
                    x, y = xywh_tensor[0][0].item(), xywh_tensor[0][1].item()
                    frame_x.append(x)
                    frame_y.append(y)
                else:
                    print('Cursor not detected')

        # No cursor detected in any frame: fall back to the original, uncropped screenshots.
        if len(frame_x) == 0 or len(frame_y) == 0:
            self.crop_scs = copy.deepcopy(self.scs)
            return self.crop_scs, self.scs

        # If the cursor moves a lot (or was detected only once), enlarge the crop window by delta.
        if (len(frame_x) <= 1) or (max(frame_x) - min(frame_x)) >= self.max_size \
                or (max(frame_y) - min(frame_y)) >= self.max_size:
            print('add margin')
            margin = self.max_size + self.delta
        else:
            margin = self.max_size

        # Crop every frame with the same window, centred on the mean cursor position.
        mid_x, mid_y = sum(frame_x) // len(frame_x), sum(frame_y) // len(frame_y)
        x1 = max(0, min(width - margin, mid_x - margin // 2))
        y1 = max(0, min(height - margin, mid_y - margin // 2))
        x2 = min(x1 + margin, width)
        y2 = min(y1 + margin, height)
        for idx, each in enumerate(sorted(self.scs)):
            image1 = Image.open(each).convert('RGB')
            file_name_tail = each.split('/')[-1]
            save_path = each.replace(file_name_tail, f'{idx}_crop.jpg')
            image1.crop((x1, y1, x2, y2)).save(save_path)
            self.crop_scs.append(save_path)
            image1.close()
        return self.crop_scs, self.scs



class Videoscreen_processor:
    def __init__(self, vid_path, fps, max_size, delta, check_point_path):
        self.vid_path = vid_path
        self.fps = fps
        self.cursor_model = YOLO(check_point_path)
        self.scs = []
        self.crop_scs =[]
        self.max_size = max_size
        self.delta = delta
        
        
        
    def sample_from_video(self):
        
        video_path_tail = self.vid_path.split('/')[-1]
        cap = cv2.VideoCapture(self.vid_path)
        if not cap.isOpened():
            print("Error: Could not open video.")
            return []
        video_fps = cap.get(cv2.CAP_PROP_FPS)  # fps
        print(video_fps)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 
        frame_interval = int(video_fps // self.fps)  
        frame_count = 0 
        frame_num = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break  
            if frame_count>1:
                break
            if frame_num % frame_interval == 0:
                frame_count = frame_num // frame_interval
                image_path = os.path.join(self.vid_path.replace(video_path_tail, f"frame_{frame_count}.jpg"))
                self.scs.append(image_path)
                frame_count += 1
                cv2.imwrite(image_path, frame)
            frame_num += 1
        cap.release()  
        
        # Run the cursor detector on every sampled frame and record the detected cursor centres.
        frame_x, frame_y = [], []
        for image_path in self.scs:
            results = self.cursor_model(image_path)
            img = Image.open(image_path)
            width, height = img.size
            img.close()

            for result in results:
                if result.boxes.xywh.size(0) > 0:
                    xywh_tensor = result.boxes.xywh
                    x, y = xywh_tensor[0][0].item(), xywh_tensor[0][1].item()
                    frame_x.append(x)
                    frame_y.append(y)
                else:
                    print('Cursor not detected')

        # No cursor detected in any frame: fall back to the original, uncropped frames.
        if len(frame_x) == 0 or len(frame_y) == 0:
            self.crop_scs = copy.deepcopy(self.scs)
            return self.crop_scs, self.crop_scs

        # If the cursor moves a lot (or was detected only once), enlarge the crop window by delta.
        if (len(frame_x) <= 1) or (max(frame_x) - min(frame_x)) >= self.max_size \
                or (max(frame_y) - min(frame_y)) >= self.max_size:
            print('add margin')
            margin = self.max_size + self.delta
        else:
            margin = self.max_size

        # Crop every sampled frame with the same window, centred on the mean cursor position.
        mid_x, mid_y = sum(frame_x) // len(frame_x), sum(frame_y) // len(frame_y)
        x1 = max(0, min(width - margin, mid_x - margin // 2))
        y1 = max(0, min(height - margin, mid_y - margin // 2))
        x2 = min(x1 + margin, width)
        y2 = min(y1 + margin, height)
        for idx, each in enumerate(sorted(self.scs)):
            image1 = Image.open(each).convert('RGB')
            file_name_tail = each.split('/')[-1]
            save_path = each.replace(file_name_tail, f'{idx}_crop.jpg')
            image1.crop((x1, y1, x2, y2)).save(save_path)
            self.crop_scs.append(save_path)
            image1.close()
        return self.crop_scs, self.scs
    
  2. Initiate the cropping strategy
Cursor_detector = Screenshots_processor('./storage/folder_to_screenshots', 512, 128, check_point_path)

cropped_imgs_list, original_imgs_list = Cursor_detector.create_crop()
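For screen recordings rather than screenshot folders, Videoscreen_processor first samples frames from the video and then applies the same cursor-centred cropping. A minimal usage sketch (the video path and the sampling fps of 2 are assumptions):

# Sample frames from a screen recording at ~2 fps, then crop them around the detected cursor.
Video_detector = Videoscreen_processor('./storage/folder_to_video/recording.mp4', 2, 512, 128, check_point_path)
cropped_imgs_list, original_imgs_list = Video_detector.sample_from_video()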

Inference Example

  1. Load Model and Prompt Space
"""load model"""
import torch
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import os
import json
import codecs 
import argparse
import random 
import re


max_pixels_temp = 160*28*28
max_pixels_narr = 760*28*28
min_pixels_narr = 240*28*28




model = Qwen2VLForConditionalGeneration.from_pretrained(
    'FRank62Wu/ShowUI-Narrator', torch_dtype="auto", device_map="cuda"
)


processor = AutoProcessor.from_pretrained('FRank62Wu/ShowUI-Narrator')   
processor.tokenizer.pad_token = processor.tokenizer.eos_token


_SYSTEM_PROMPT = 'For the given video frames of a GUI action, the frames are described in the format of <0> to <{N}>.'



_SYSTEM_PROMPT_NARR = '''You are an AI assistant that narrates the user's action in the video frames with the following details.
'Action': The type of action
'Element': The target of the action
'Source': The starting position (Applicable for action type: Drag)
'Destination': The ending position (Applicable for action type: Drag)
'Purpose': The intended result of the action
The Action types include left click, right click, double click, drag, or Keyboard type.
'''


Action_no_reference_grounding = [
     'Describe the start frame and the end frame of the action in this video?',
     'When Did the action happened in this video? Tell me the start frame and the end frame.',
     'Locate the start and the end frame of the action in this video',
     "Observe the cursor in this GUI video, marking start and end frame of the action in video frames."
]


Dense_narration_query = ['Narrate the action in the given video.',
                         'Describe the action of the user in the given frames',
                         'Describe the action in this video.',
                         'Narrate the action detail of the user in the video.']
  2. Round 1: Temporal grounding to detect keyframes. (We take an action from PR as an example.)
path_to_data =''

query = _SYSTEM_PROMPT.format(N=9) + ' ' + random.choice(Action_no_reference_grounding)
messages = [
        {
            'role': 'user', 
            'content': [
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/0_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/1_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/2_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/3_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/4_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/5_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/6_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/7_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/8_crop.png","max_pixels": max_pixels_temp},
                        {'type':"image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/9_crop.png","max_pixels": max_pixels_temp},
                        {'type':"text",'text': query},
                        ]
        }   
    ]
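For longer frame sequences, the repeated image entries above can be built programmatically. An equivalent sketch for the same ten cropped frames:

# Build the same content list in a loop instead of repeating each image entry by hand.
frame_dir = f"{path_to_data}/storage/test_benchmark_Act2Cap/303"
content = [
    {'type': "image", "image": f"{frame_dir}/{i}_crop.png", "max_pixels": max_pixels_temp}
    for i in range(10)
]
content.append({'type': "text", 'text': query})
messages = [{'role': 'user', 'content': content}]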



## round_1 for temporal grounding
text = processor.apply_chat_template(
                            messages, tokenize=False, add_generation_prompt=True,
                        )
          
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(output_text)
>>> Output: <6> and <8>

  3. Round 2: Use the selected keyframes to generate a caption in JSON format.
# round_2 for dense narration caption
# Parse the two keyframe indices from the round-1 output; fall back to the full range on failure.
try:
    matches = re.search(r"<(\w+)>.*?<(\w+)>", output_text)
    s1, e1 = int(matches.group(1)), int(matches.group(2))
except (AttributeError, ValueError):
    s1, e1 = 0, 9
    

query = _SYSTEM_PROMPT_NARR + ' ' + random.choice(Dense_narration_query)

selected_images = []

# Use a higher image resolution when only a few keyframes were selected.
if e1 - s1 <= 3:
    pixels_narr = max_pixels_narr
else:
    pixels_narr = min_pixels_narr

# Keep only the image entries between the grounded start and end frames.
for idx, each in enumerate(messages[0]['content']):
    if s1 <= idx <= e1:
        new_image = each.copy()
        new_image['max_pixels'] = pixels_narr
        selected_images.append(new_image)

messages = [
    {
        'role': 'user',
        'content': selected_images + [{'type': "text", 'text': query}],
    }
]

text = processor.apply_chat_template(
                            messages, tokenize=False, add_generation_prompt=True,
                        )
          
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text_narration = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(output_text_narration)
    
>>> Output: {"Action": "double click", "Element": "sc2 trans shape button", "Source": null, "Destination": null, "Purpose": " Select the SC2 Trans Shape."}
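Because the narration is returned as a JSON-formatted string, it can be parsed back into a Python dict; a small sketch with a fallback for malformed output:

import json

# Parse the narration string into a dict; keep the raw text if the output is not valid JSON.
try:
    narration = json.loads(output_text_narration)
except json.JSONDecodeError:
    narration = {"raw": output_text_narration}

print(narration.get("Action"), "->", narration.get("Element"))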