ShowUI-Narrator is a lightweight (2B) framework for narrating user actions in GUI videos and screenshots, built upon YOLOv8, Qwen2-VL, and ShowUI.
Quick Start: Import dependencies
pip install -r requirements.txt
Overview of the action-narration pipeline.

Download the Vision-Language Model
import torch
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import os
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'FRank62Wu/ShowUI-Narrator', torch_dtype="auto", device_map="cuda"
)
processor = AutoProcessor.from_pretrained('FRank62Wu/ShowUI-Narrator')
processor.tokenizer.pad_token = processor.tokenizer.eos_token
Cursor Detector Example
- Load the detector model and define the classes used for image cropping
import os
import base64
from PIL import Image
from io import BytesIO
import copy
import cv2
from ultralytics import YOLO
def image_to_base64(img_path):
    with open(img_path, "rb") as img_file:
        encoded_img = base64.b64encode(img_file.read()).decode("utf-8")
    return encoded_img
check_point_path = './ShowUI_Action_Narrator_cursor_detect/best.pt'
class Screenshots_processor:
    def __init__(self, img_path, max_size, delta, check_point_path):
        self.img_path = img_path
        self.cursor_model = YOLO(check_point_path)
        self.scs = []
        self.crop_scs = []
        self.max_size = max_size
        self.delta = delta

    def create_crop(self):
        # Collect screenshot paths, skipping annotation files and previously saved crops.
        for each in sorted(os.listdir(self.img_path)):
            if each.endswith('.jsonl') or '_crop' in each:
                continue
            self.scs.append(os.path.join(self.img_path, each))

        # Detect the cursor in every frame with the YOLO checkpoint.
        frame_x, frame_y = [], []
        width, height = 0, 0
        for image_path in self.scs:
            results = self.cursor_model(image_path)
            with Image.open(image_path) as img:
                width, height = img.size
            for result in results:
                if result.boxes.xywh.size(0) > 0:
                    xywh_tensor = result.boxes.xywh
                    frame_x.append(xywh_tensor[0][0].item())
                    frame_y.append(xywh_tensor[0][1].item())
                else:
                    print('Cursor not detected')

        # No cursor detected in any frame: fall back to the original screenshots.
        if len(frame_x) == 0 or len(frame_y) == 0:
            self.crop_scs = copy.deepcopy(self.scs)
            return self.crop_scs, self.scs

        # Enlarge the crop window when only one detection is available
        # or when the cursor moves farther than max_size.
        if (len(frame_x) <= 1) or (max(frame_x) - min(frame_x)) >= self.max_size \
                or (max(frame_y) - min(frame_y)) >= self.max_size:
            print('add margin')
            margin = self.max_size + self.delta
        else:
            margin = self.max_size

        # Crop a margin x margin window centred on the mean cursor position.
        mid_x, mid_y = sum(frame_x) // len(frame_x), sum(frame_y) // len(frame_y)
        x1 = max(0, min(width - margin, mid_x - margin // 2))
        y1 = max(0, min(height - margin, mid_y - margin // 2))
        x2 = min(x1 + margin, width)
        y2 = min(y1 + margin, height)
        for idx, image_path in enumerate(sorted(self.scs)):
            image1 = Image.open(image_path).convert('RGB')
            file_name_tail = image_path.split('/')[-1]
            save_path = image_path.replace(file_name_tail, f'{idx}_crop.jpg')
            image1.crop((x1, y1, x2, y2)).save(save_path)
            self.crop_scs.append(save_path)
            image1.close()
        return self.crop_scs, self.scs
class Videoscreen_processor:
    def __init__(self, vid_path, fps, max_size, delta, check_point_path):
        self.vid_path = vid_path
        self.fps = fps
        self.cursor_model = YOLO(check_point_path)
        self.scs = []
        self.crop_scs = []
        self.max_size = max_size
        self.delta = delta

    def sample_from_video(self):
        # Sample frames from the video at self.fps and save them next to the video file.
        video_path_tail = self.vid_path.split('/')[-1]
        cap = cv2.VideoCapture(self.vid_path)
        if not cap.isOpened():
            print("Error: Could not open video.")
            return []
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        print(video_fps)
        frame_interval = int(video_fps // self.fps)
        frame_count = 0
        frame_num = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count > 1:  # stop after two sampled frames
                break
            if frame_num % frame_interval == 0:
                frame_count = frame_num // frame_interval
                image_path = self.vid_path.replace(video_path_tail, f"frame_{frame_count}.jpg")
                self.scs.append(image_path)
                frame_count += 1
                cv2.imwrite(image_path, frame)
            frame_num += 1
        cap.release()

        # Detect the cursor in every sampled frame (same strategy as Screenshots_processor).
        frame_x, frame_y = [], []
        width, height = 0, 0
        for image_path in self.scs:
            results = self.cursor_model(image_path)
            with Image.open(image_path) as img:
                width, height = img.size
            for result in results:
                if result.boxes.xywh.size(0) > 0:
                    xywh_tensor = result.boxes.xywh
                    frame_x.append(xywh_tensor[0][0].item())
                    frame_y.append(xywh_tensor[0][1].item())
                else:
                    print('Cursor not detected')

        # No cursor detected in any frame: fall back to the original frames.
        if len(frame_x) == 0 or len(frame_y) == 0:
            self.crop_scs = copy.deepcopy(self.scs)
            return self.crop_scs, self.scs

        # Choose the crop size depending on how far the cursor moves.
        if (len(frame_x) <= 1) or (max(frame_x) - min(frame_x)) >= self.max_size \
                or (max(frame_y) - min(frame_y)) >= self.max_size:
            print('add margin')
            margin = self.max_size + self.delta
        else:
            margin = self.max_size

        # Crop a margin x margin window centred on the mean cursor position.
        mid_x, mid_y = sum(frame_x) // len(frame_x), sum(frame_y) // len(frame_y)
        x1 = max(0, min(width - margin, mid_x - margin // 2))
        y1 = max(0, min(height - margin, mid_y - margin // 2))
        x2 = min(x1 + margin, width)
        y2 = min(y1 + margin, height)
        for idx, image_path in enumerate(sorted(self.scs)):
            image1 = Image.open(image_path).convert('RGB')
            file_name_tail = image_path.split('/')[-1]
            save_path = image_path.replace(file_name_tail, f'{idx}_crop.jpg')
            image1.crop((x1, y1, x2, y2)).save(save_path)
            self.crop_scs.append(save_path)
            image1.close()
        return self.crop_scs, self.scs
- Initiate the cropping strategy
Cursor_detector = Screenshots_processor('./storage/folder_to_screenshots', 512, 128, check_point_path)
cropped_imgs_list, original_imgs_list = Cursor_detector.create_crop()
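The video variant is used the same way once a screen recording is available. A minimal sketch, assuming a local recording path and a 1 fps sampling rate (both placeholders):
# Sample frames from a screen recording, then crop around the detected cursor.
# './storage/screen_recording.mp4' is a placeholder path, not part of the released assets.
Video_detector = Videoscreen_processor('./storage/screen_recording.mp4', 1, 512, 128, check_point_path)
cropped_video_frames, original_video_frames = Video_detector.sample_from_video()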
Inference Example
- Load Model and Prompt Space
"""load model"""
import torch
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import os
import json
import codecs
import argparse
import random
import re
max_pixels_temp = 160*28*28
max_pixels_narr = 760*28*28
min_pixels_narr = 240*28*28
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'FRank62Wu/ShowUI-Narrator', torch_dtype="auto", device_map="cuda"
)
processor = AutoProcessor.from_pretrained('FRank62Wu/ShowUI-Narrator')
processor.tokenizer.pad_token = processor.tokenizer.eos_token
_SYSTEM_PROMPT='For the given video frames of a GUI action, The frames are decribed in the format of <0> to <{N}>.'
_SYSTEM_PROMPT_NARR='''You are an ai assistant to narrate the action of the user for the video frames in the following detail.
'Action': The type of action
'Element': The target of the action
'Source': The starting position (Applicable for action type: Drag)
'Destination': The ending position (Applicable for action type: Drag)
'Purpose': The intended result of the action
The Action include left click, right click, double click, drag, or Keyboard type.
'''
Action_no_reference_grounding = [
'Describe the start frame and the end frame of the action in this video?',
'When Did the action happened in this video? Tell me the start frame and the end frame.',
'Locate the start and the end frame of the action in this video',
"Observe the cursor in this GUI video, marking start and end frame of the action in video frames."
]
Dense_narration_query = ['Narrate the action in the given video.',
'Describe the action of the user in the given frames',
'Describe the action in this video.',
'Narrate the action detail of the user in the video.']
- Round 1: Temporal grounding to detect keyframes (we use an action from PR as an example)
path_to_data = ''
query = _SYSTEM_PROMPT.format(N=9) + ' ' + random.choice(Action_no_reference_grounding)
messages = [
    {
        'role': 'user',
        'content': [
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/0_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/1_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/2_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/3_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/4_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/5_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/6_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/7_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/8_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/9_crop.png", "max_pixels": max_pixels_temp},
            {'type': "text", 'text': query},
        ]
    }
]
## round_1 for temporal grounding
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(output_text)
>>> Output: <6> and <8>
- Round 2: Use the selected keyframes to generate a caption in JSON format
# round_2 for dense narration caption
# Parse the start/end frame indices from the grounding output; fall back to the full clip.
try:
    matches = re.search(r"<(\w+)>.*?<(\w+)>", output_text)
    s1, e1 = int(matches.group(1)), int(matches.group(2))
except (AttributeError, ValueError):
    s1, e1 = 0, 9
query = _SYSTEM_PROMPT_NARR + ' ' + random.choice(Dense_narration_query)
selected_images = []
# Use a higher image resolution when only a few keyframes are selected.
if e1 - s1 <= 3:
    pixels_narr = max_pixels_narr
else:
    pixels_narr = min_pixels_narr
for idx, each in enumerate(messages[0]['content']):
    if s1 <= idx <= e1:
        new_image = each.copy()
        new_image['max_pixels'] = pixels_narr
        selected_images.append(new_image)
messages = [
    {
        'role': 'user',
        'content': selected_images + [{'type': "text", 'text': query}],
    }
]
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text_narration = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(output_text_narration)
>>> Output: {"Action": "double click", "Element": "sc2 trans shape button", "Source": null, "Destination": null, "Purpose": " Select the SC2 Trans Shape."}