ShowUI-Narrator is a lightweight (2B) framework for narrating user actions in GUI videos and screenshots, built upon YOLOv8, Qwen2-VL, and ShowUI.
Quick Start: Import dependencies
pip install -r requirements.txt
Overview of the action-narration pipeline.

Download the Vision-Language Model
import torch
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import os
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'FRank62Wu/ShowUI-Narrator', torch_dtype="auto", device_map="cuda"
)
processor = AutoProcessor.from_pretrained('FRank62Wu/ShowUI-Narrator')
processor.tokenizer.pad_token = processor.tokenizer.eos_token
Cursor Detector Example
- Load the detector model and define the classes used for image cropping
import os
import base64
from PIL import Image
from io import BytesIO
import copy
import cv2
from ultralytics import YOLO
def image_to_base64(img_path):
    with open(img_path, "rb") as img_file:
        encoded_img = base64.b64encode(img_file.read()).decode("utf-8")
    return encoded_img
check_point_path = './ShowUI_Action_Narrator_cursor_detect/best.pt'
class Screenshots_processor:
    def __init__(self, img_path, max_size, delta, check_point_path):
        self.img_path = img_path
        self.cursor_model = YOLO(check_point_path)
        self.scs = []
        self.crop_scs = []
        self.max_size = max_size
        self.delta = delta

    def create_crop(self):
        # Collect screenshot paths, skipping annotation files and previously saved crops.
        for each in sorted(os.listdir(self.img_path)):
            if each.endswith('.jsonl') or '_crop' in each:
                continue
            self.scs.append(os.path.join(self.img_path, each))

        # Detect the cursor in every frame with the YOLO checkpoint.
        frame_x, frame_y = [], []
        width, height = 0, 0
        for image_path in self.scs:
            results = self.cursor_model(image_path)
            with Image.open(image_path) as img:
                width, height = img.size
            for result in results:
                if result.boxes.xywh.size(0) > 0:
                    xywh_tensor = result.boxes.xywh
                    frame_x.append(xywh_tensor[0][0].item())
                    frame_y.append(xywh_tensor[0][1].item())
                else:
                    print('Cursor not detected')

        # No cursor detected in any frame: fall back to the original screenshots.
        if len(frame_x) == 0 or len(frame_y) == 0:
            self.crop_scs = copy.deepcopy(self.scs)
            return self.crop_scs, self.scs

        # Enlarge the crop window when only one detection is available
        # or when the cursor moves farther than max_size.
        if (len(frame_x) <= 1) or (max(frame_x) - min(frame_x)) >= self.max_size \
                or (max(frame_y) - min(frame_y)) >= self.max_size:
            print('add margin')
            margin = self.max_size + self.delta
        else:
            margin = self.max_size

        # Crop a margin x margin window centred on the mean cursor position.
        mid_x, mid_y = sum(frame_x) // len(frame_x), sum(frame_y) // len(frame_y)
        x1 = max(0, min(width - margin, mid_x - margin // 2))
        y1 = max(0, min(height - margin, mid_y - margin // 2))
        x2 = min(x1 + margin, width)
        y2 = min(y1 + margin, height)
        for idx, image_path in enumerate(sorted(self.scs)):
            image1 = Image.open(image_path).convert('RGB')
            file_name_tail = image_path.split('/')[-1]
            save_path = image_path.replace(file_name_tail, f'{idx}_crop.jpg')
            image1.crop((x1, y1, x2, y2)).save(save_path)
            self.crop_scs.append(save_path)
            image1.close()
        return self.crop_scs, self.scs
class Videoscreen_processor:
    def __init__(self, vid_path, fps, max_size, delta, check_point_path):
        self.vid_path = vid_path
        self.fps = fps
        self.cursor_model = YOLO(check_point_path)
        self.scs = []
        self.crop_scs = []
        self.max_size = max_size
        self.delta = delta

    def sample_from_video(self):
        # Sample frames from the video at self.fps and save them next to the video file.
        video_path_tail = self.vid_path.split('/')[-1]
        cap = cv2.VideoCapture(self.vid_path)
        if not cap.isOpened():
            print("Error: Could not open video.")
            return []
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        print(video_fps)
        frame_interval = int(video_fps // self.fps)
        frame_count = 0
        frame_num = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count > 1:  # stop after two sampled frames
                break
            if frame_num % frame_interval == 0:
                frame_count = frame_num // frame_interval
                image_path = self.vid_path.replace(video_path_tail, f"frame_{frame_count}.jpg")
                self.scs.append(image_path)
                frame_count += 1
                cv2.imwrite(image_path, frame)
            frame_num += 1
        cap.release()

        # Detect the cursor in every sampled frame (same strategy as Screenshots_processor).
        frame_x, frame_y = [], []
        width, height = 0, 0
        for image_path in self.scs:
            results = self.cursor_model(image_path)
            with Image.open(image_path) as img:
                width, height = img.size
            for result in results:
                if result.boxes.xywh.size(0) > 0:
                    xywh_tensor = result.boxes.xywh
                    frame_x.append(xywh_tensor[0][0].item())
                    frame_y.append(xywh_tensor[0][1].item())
                else:
                    print('Cursor not detected')

        # No cursor detected in any frame: fall back to the original frames.
        if len(frame_x) == 0 or len(frame_y) == 0:
            self.crop_scs = copy.deepcopy(self.scs)
            return self.crop_scs, self.scs

        # Choose the crop size depending on how far the cursor moves.
        if (len(frame_x) <= 1) or (max(frame_x) - min(frame_x)) >= self.max_size \
                or (max(frame_y) - min(frame_y)) >= self.max_size:
            print('add margin')
            margin = self.max_size + self.delta
        else:
            margin = self.max_size

        # Crop a margin x margin window centred on the mean cursor position.
        mid_x, mid_y = sum(frame_x) // len(frame_x), sum(frame_y) // len(frame_y)
        x1 = max(0, min(width - margin, mid_x - margin // 2))
        y1 = max(0, min(height - margin, mid_y - margin // 2))
        x2 = min(x1 + margin, width)
        y2 = min(y1 + margin, height)
        for idx, image_path in enumerate(sorted(self.scs)):
            image1 = Image.open(image_path).convert('RGB')
            file_name_tail = image_path.split('/')[-1]
            save_path = image_path.replace(file_name_tail, f'{idx}_crop.jpg')
            image1.crop((x1, y1, x2, y2)).save(save_path)
            self.crop_scs.append(save_path)
            image1.close()
        return self.crop_scs, self.scs
- Initiate the cropping strategy
Cursor_detector = Screenshots_processor('./storage/folder_to_screenshots', 512, 128, check_point_path)
cropped_imgs_list, original_imgs_list = Cursor_detector.create_crop()
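The video variant is used the same way once a screen recording is available. A minimal sketch, assuming a local recording path and a 1 fps sampling rate (both placeholders):
# Sample frames from a screen recording, then crop around the detected cursor.
# './storage/screen_recording.mp4' is a placeholder path, not part of the released assets.
Video_detector = Videoscreen_processor('./storage/screen_recording.mp4', 1, 512, 128, check_point_path)
cropped_video_frames, original_video_frames = Video_detector.sample_from_video()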
Inference Example
- Load Model and Prompt Space
"""load model"""
import torch
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import os
import json
import codecs
import argparse
import random
import re
max_pixels_temp = 160*28*28
max_pixels_narr = 760*28*28
min_pixels_narr = 240*28*28
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'FRank62Wu/ShowUI-Narrator', torch_dtype="auto", device_map="cuda"
)
processor = AutoProcessor.from_pretrained('FRank62Wu/ShowUI-Narrator')
processor.tokenizer.pad_token = processor.tokenizer.eos_token
_SYSTEM_PROMPT='For the given video frames of a GUI action, The frames are decribed in the format of <0> to <{N}>.'
_SYSTEM_PROMPT_NARR='''You are an ai assistant to narrate the action of the user for the video frames in the following detail.
'Action': The type of action
'Element': The target of the action
'Source': The starting position (Applicable for action type: Drag)
'Destination': The ending position (Applicable for action type: Drag)
'Purpose': The intended result of the action
The Action include left click, right click, double click, drag, or Keyboard type.
'''
Action_no_reference_grounding = [
'Describe the start frame and the end frame of the action in this video?',
'When Did the action happened in this video? Tell me the start frame and the end frame.',
'Locate the start and the end frame of the action in this video',
"Observe the cursor in this GUI video, marking start and end frame of the action in video frames."
]
Dense_narration_query = ['Narrate the action in the given video.',
'Describe the action of the user in the given frames',
'Describe the action in this video.',
'Narrate the action detail of the user in the video.']
- Round 1: Temporal grounding to detect keyframes (we use an action from PR as an example)
path_to_data = ''
query = _SYSTEM_PROMPT.format(N=9) + ' ' + random.choice(Action_no_reference_grounding)
messages = [
    {
        'role': 'user',
        'content': [
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/0_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/1_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/2_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/3_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/4_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/5_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/6_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/7_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/8_crop.png", "max_pixels": max_pixels_temp},
            {'type': "image", "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/9_crop.png", "max_pixels": max_pixels_temp},
            {'type': "text", 'text': query},
        ]
    }
]
## round_1 for temporal grounding
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(output_text)
>>> Output: <6> and <8>
- Round 2: Use the selected keyframes to generate a caption in JSON format
# round_2 for dense narration caption
# Parse the start/end frame indices from the grounding output; fall back to the full clip.
try:
    matches = re.search(r"<(\w+)>.*?<(\w+)>", output_text)
    s1, e1 = int(matches.group(1)), int(matches.group(2))
except (AttributeError, ValueError):
    s1, e1 = 0, 9
query = _SYSTEM_PROMPT_NARR + ' ' + random.choice(Dense_narration_query)
selected_images = []
# Use a higher image resolution when only a few keyframes are selected.
if e1 - s1 <= 3:
    pixels_narr = max_pixels_narr
else:
    pixels_narr = min_pixels_narr
for idx, each in enumerate(messages[0]['content']):
    if s1 <= idx <= e1:
        new_image = each.copy()
        new_image['max_pixels'] = pixels_narr
        selected_images.append(new_image)
messages = [
    {
        'role': 'user',
        'content': selected_images + [{'type': "text", 'text': query}],
    }
]
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text_narration = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(output_text_narration)
>>> Output: {"Action": "double click", "Element": "sc2 trans shape button", "Source": null, "Destination": null, "Purpose": " Select the SC2 Trans Shape."}