Spaces:

szukevin
/

VISOR-GPT

Runtime error

File size: 13,521 Bytes

7900c16


"""
decode sequential output to visual locations
author: sierkinhane.github.io
"""
import random
from tqdm import tqdm
import json
import numpy as np
import re
import argparse
import cv2
import math
import os

# Minor semi-axis (in pixels) of the ellipse drawn for every skeleton limb.
stickwidth = 4

# Limb table for the 18-point COCO-style skeleton. Every pair holds two
# 1-based key-point indices (the drawing code subtracts 1 before indexing).
# Only the first 17 pairs are rendered by `visualization`.
limbSeq_coco = [
    [2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
    [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
    [1, 16], [16, 18], [3, 17], [6, 18],
]

# Limb table for the 14-point CrowdPose skeleton, also as 1-based index pairs.
# CrowdPose key-point order (0-based):
# {'0': 'left shoulder', '1': 'right shoulder', '2': 'left elbow', '3': 'right elbow',
#  '4': 'left wrist', '5': 'right wrist', '6': 'left hip', '7': 'right hip',
#  '8': 'left knee', '9': 'right knee', '10': 'left ankle', '11': 'right ankle',
#  '12': 'head', '13': 'neck'}
limbSeq_cp = [
    [14, 2], [14, 1], [2, 4], [4, 6], [1, 3], [3, 5], [14, 8],
    [8, 10], [10, 12], [14, 7], [7, 9], [9, 11], [13, 14],
]

# Palette for human pose visualization; indexed by key-point / limb number.
colors = [
    [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0],
    [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255],
    [0, 0, 255], [85, 0, 255], [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85],
]

# Palette for box visualization; indexed by the position of the box in its sequence.
colors_box = [
    [217, 221, 116], [137, 165, 171], [230, 126, 175], [63, 157, 5], [107, 51, 75],
    [217, 147, 152], [129, 132, 8], [232, 85, 249], [254, 98, 33], [89, 108, 230],
    [253, 34, 161], [91, 150, 30], [255, 147, 26], [209, 154, 205], [134, 57, 11],
    [143, 181, 122], [241, 176, 87], [104, 73, 26], [122, 147, 59], [235, 230, 229],
    [119, 18, 125], [185, 61, 138], [237, 115, 90], [13, 209, 111], [219, 172, 212],
]

# Plots one bounding box on image
def plot_one_box(x, img, color=None, label=None, line_thickness=None, idx=0):
    """Draw one bounding box, plus an optional filled label tag, on ``img`` in place.

    Args:
        x: Box corners ``(xmin, ymin, xmax, ymax)``; every value is cast to ``int``.
        img: Image array that is drawn on in place. ``img.shape[0]`` and
            ``img.shape[1]`` are read to derive the default line thickness.
        color: Optional colour triple for the box and the label background.
            When ``None`` the colour is taken from ``colors_box`` using ``idx``.
        label: Optional text rendered in a filled tag at the first corner.
            Nothing is rendered when it is ``None`` or empty.
        line_thickness: Optional line thickness in pixels; derived from the
            image size when falsy.
        idx: Palette index used when ``color`` is ``None``; wraps around the
            length of ``colors_box``.

    Returns:
        The same ``img`` object that was passed in.
    """
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line thickness
    if color is None:
        # An explicit colour wins; otherwise fall back to the fixed palette.
        # The modulo keeps sequences with more boxes than palette entries drawable.
        color = colors_box[idx % len(colors_box)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl)
    if label:
        # Everything that depends on the label lives inside this branch so an
        # unlabeled box never touches the text metrics.
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1)  # filled
        cv2.putText(img, label, c1, 0, tl / 3, [0, 0, 0], thickness=tf, lineType=cv2.LINE_AA)
    return img


# decode one sequence to visual locations
def decode(coordinate_str, type='box'):
    """Turn the coordinate part of one text sequence into numeric locations.

    Args:
        coordinate_str: Text containing the coordinates as decimal integers.
        type: One of ``'box'``, ``'cocokeypoint'``, ``'crowdpose'`` or ``'mask'``.

    Returns:
        * ``'box'``: array of shape ``(n, 4)``.
        * ``'cocokeypoint'`` / ``'crowdpose'``: array of shape ``(n, 18, 3)`` /
          ``(n, 14, 3)``; the last channel is a visibility flag that is 0 when
          either coordinate of the point is 0 and 1 otherwise.
        * ``'mask'``: list with one ``(k, 1, 2)`` array per ``'m0'``-delimited
          chunk that contains at least one number.

    Raises:
        NotImplementedError: If ``type`` is not one of the values above.
    """
    if type == 'mask':
        polygons = []
        # Each instance starts at its 'm0' marker; the remaining 'm<k>' markers
        # are removed so their digits are not mistaken for coordinates.
        for chunk in coordinate_str.split('m0'):
            without_markers = ''.join(re.split(r'm\d+', chunk))
            values = np.array([int(tok) for tok in re.findall(r"\d+ ", without_markers)])
            if len(values) != 0:
                polygons.append(values.reshape(-1, 1, 2))
        return polygons

    if type == 'box':
        points_per_instance = None
    elif type == 'cocokeypoint':
        points_per_instance = 18
    elif type == 'crowdpose':
        points_per_instance = 14
    else:
        raise NotImplementedError

    # find numbers
    numbers = np.array([int(tok) for tok in re.findall(r"\d+", coordinate_str)])

    if points_per_instance is None:
        return numbers.reshape(-1, 4)

    coords = numbers.reshape(-1, points_per_instance, 2)
    # A point counts as visible only when both of its coordinates are non-zero.
    visible = (coords[:, :, 0] * coords[:, :, 1] != 0).astype(np.float64)[..., np.newaxis]
    keypoints = np.concatenate([coords, visible], axis=-1)

    if type == 'cocokeypoint':
        # Point 1 is hidden whenever point 2 or point 5 is hidden.
        hide_point_1 = (keypoints[:, 2, -1] == 0) | (keypoints[:, 5, -1] == 0)
        keypoints[hide_point_1, 1, -1] = 0

    return keypoints


# process raw sequences inferred by VisorGPT
def to_coordinate(file_path, ctn=True):
    """Parse raw text sequences into locations, class names and annotation types.

    Every input line is stripped and has ``' ##'`` removed. Only the text
    before ``'[SEP]'`` is parsed; it is split on ``' ; '`` into fields.

    Args:
        file_path: Either a list of sequence strings or the path of a text
            file holding one sequence per line.
        ctn: When true, class names and coordinates are both read from the
            sixth field. When false, class names come from the sixth field
            (split on ``' , '``) and coordinates from the seventh.

    Returns:
        ``(location_list, classname_list, type_list, valid_sequences)``: four
        parallel lists with one entry per successfully parsed sequence.

    Raises:
        NotImplementedError: If a line containing ``'[SEP]'`` mentions none of
            ``'box'``, ``'key point'`` or ``'mask'``.

    Side effects:
        Writes the parsed sequences (text after ``'[CLS] '``) to
        ``valid_sequences.txt`` in the current working directory.
    """
    if isinstance(file_path, list):
        texts = [i.strip().replace(' ##', '') for i in file_path]
    else:
        with open(file_path, 'r') as file:
            texts = [i.strip().replace(' ##', '') for i in file]

    location_list = []
    classname_list = []
    type_list = []
    valid_sequences = []
    print('to coordinate ...')

    for ste in tqdm(texts):
        # Blank or truncated lines carry no complete sequence; skip them before
        # the type check so they cannot abort the whole run.
        if '[SEP]' not in ste:
            continue

        if 'box' in ste:
            ann_type = 'box'
        elif 'key point' in ste:
            ann_type = 'cocokeypoint' if '; 18 ;' in ste else 'crowdpose'
        elif 'mask' in ste:
            ann_type = 'mask'
        else:
            raise NotImplementedError

        sequence = ste[:ste.index('[SEP]')]
        fields = sequence.split(' ; ')

        try:
            if ctn:
                # Each instance looks like "[ <name> xmin ..." or "[ <name> m0 ...";
                # [2:] drops the leading "[ " and the piece after the last "] " is discarded.
                classnames = [
                    piece.split(' xmin ')[0].split(' m0')[0][2:]
                    for piece in fields[5].split('] ')
                ][:-1]
                locations = decode(fields[5], type=ann_type)
            else:
                classnames = fields[5].split(' , ')
                locations = decode(fields[6], type=ann_type)
        except Exception:
            # Malformed model output (missing fields, coordinate counts that do
            # not reshape, ...) is expected: drop the sequence and carry on.
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
            continue

        valid_sequences.append(sequence)
        location_list.append(locations)
        classname_list.append(classnames)
        type_list.append(ann_type)

    with open('valid_sequences.txt', 'w') as file:
        file.writelines(seq.split('[CLS] ')[-1] + '\n' for seq in valid_sequences)

    return location_list, classname_list, type_list, valid_sequences

# blend one skeleton limb onto a canvas
def _draw_limb(canvas, joint_a, joint_b, color):
    """Return ``canvas`` blended with a filled ellipse spanning two joints.

    ``joint_a`` and ``joint_b`` are indexable as ``(x, y, ...)``. The ellipse is
    centred between them, has ``stickwidth`` as minor semi-axis, and is mixed
    into the canvas with weights 0.4 (canvas) and 0.6 (overlay).
    """
    xs = [joint_a[0], joint_b[0]]
    ys = [joint_a[1], joint_b[1]]
    length = ((ys[0] - ys[1]) ** 2 + (xs[0] - xs[1]) ** 2) ** 0.5
    angle = math.degrees(math.atan2(ys[0] - ys[1], xs[0] - xs[1]))
    polygon = cv2.ellipse2Poly(
        (int(np.mean(xs)), int(np.mean(ys))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
    )
    overlay = canvas.copy()
    cv2.fillConvexPoly(overlay, polygon, color)
    return cv2.addWeighted(canvas, 0.4, overlay, 0.6, 0)


# visualize object locations on a canvas
def visualization(location_list, classname_list, type_list, save_dir='debug/', save_fig=False):
    """Render every decoded sequence on its own 512x512 canvas.

    Args:
        location_list: Decoded locations, one entry per sequence.
        classname_list: Class names, one list per sequence.
        type_list: Annotation type per sequence: ``'box'``, ``'cocokeypoint'``,
            ``'crowdpose'`` or ``'mask'``.
        save_dir: Directory receiving ``test_<index>.png`` files; only used
            (and created if missing) when ``save_fig`` is true.
        save_fig: Whether to write each rendered canvas to ``save_dir``.

    Returns:
        The canvas of the last sequence with its channel order reversed. A
        blank canvas is returned when the input is empty or when the last
        sequence is skipped.

    Raises:
        NotImplementedError: If an annotation type is not supported.

    Sequences whose number of locations differs from their number of class
    names are skipped and never saved.
    """
    if save_fig:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    print('visualizing ...')
    # Defined up front so an empty input still has a canvas to return.
    canvas = np.full((512, 512, 3), 50, dtype=np.uint8)
    for b, (loc, classnames, ann_type) in tqdm(enumerate(zip(location_list, classname_list, type_list))):
        canvas = np.full((512, 512, 3), 50, dtype=np.uint8)

        if len(loc) != len(classnames):
            continue

        if ann_type == 'box':
            for i in range(loc.shape[0]):
                # Wrap the palette index so more boxes than palette colours still render.
                canvas = plot_one_box(loc[i], canvas, label=classnames[i], idx=i % len(colors_box))

        elif ann_type == 'cocokeypoint':
            for person in loc:
                for j, (x, y, v) in enumerate(person):
                    if v != 0:
                        cv2.circle(canvas, (int(x), int(y)), 4, colors[j], thickness=-1)
                # Only the first 17 limb pairs are drawn.
                for j, (a, b_idx) in enumerate(limbSeq_coco[:17]):
                    joint_a, joint_b = person[a - 1], person[b_idx - 1]
                    # A limb needs both end points flagged visible.
                    if joint_a[-1] == 0 or joint_b[-1] == 0:
                        continue
                    canvas = _draw_limb(canvas, joint_a, joint_b, colors[j])

        elif ann_type == 'crowdpose':
            for person in loc:
                for j, (x, y, _) in enumerate(person):
                    if x != 0 and y != 0:
                        cv2.circle(canvas, (int(x), int(y)), 4, colors[j], thickness=-1)
                for j, (a, b_idx) in enumerate(limbSeq_cp[:13]):
                    joint_a, joint_b = person[a - 1], person[b_idx - 1]
                    # A joint located exactly at (0, 0) is treated as missing.
                    if (joint_a[0] == 0 and joint_a[1] == 0) or (joint_b[0] == 0 and joint_b[1] == 0):
                        continue
                    canvas = _draw_limb(canvas, joint_a, joint_b, colors[j])

        elif ann_type == 'mask':
            for i in range(len(loc)):
                color = [random.randint(0, 255) for _ in range(3)]
                # cv2.fillPoly only accepts 32-bit integer points, while the
                # decoded arrays use the platform default integer type.
                polygon = np.asarray(loc[i], dtype=np.int32)
                xmin, ymin = polygon[:, :, 0].min(), polygon[:, :, 1].min()
                xmax, ymax = polygon[:, :, 0].max(), polygon[:, :, 1].max()
                cur_canvas = canvas.copy()
                cv2.fillPoly(cur_canvas, [polygon], color)
                cur_canvas = plot_one_box((xmin, ymin, xmax, ymax), cur_canvas, color=color, label=classnames[i])
                canvas = cv2.addWeighted(canvas, 0.5, cur_canvas, 0.5, 0)
        else:
            raise NotImplementedError

        if save_fig:
            cv2.imwrite(f'{save_dir}/test_{b}.png', canvas[..., ::-1])

    return canvas[..., ::-1]

# to json output
def to_json(location_list, classname_list, type_list, valid_sequences):
    """Group decoded sequences into JSON-serialisable dictionaries by annotation kind.

    Args:
        location_list: Decoded locations, one entry per sequence.
        classname_list: Class names, one list per sequence.
        type_list: Annotation type per sequence: ``'box'``, ``'cocokeypoint'``,
            ``'crowdpose'`` or ``'mask'``.
        valid_sequences: Source text of every sequence.

    Returns:
        ``[boxes, masks, keypoints]``. Each dictionary maps ``'bboxes'`` /
        ``'masks'`` / ``'keypoints'`` to a list with one entry per kept
        sequence (itself a list of ``{classname: coordinates}`` dictionaries),
        and ``'sequences'`` to a parallel list of ``[sequence_text]``.

    Raises:
        NotImplementedError: If an annotation type is not supported.

    Sequences whose number of locations differs from their number of class
    names, or that contain no instance at all, are left out.
    """
    ret_json_box = {'bboxes': [], 'sequences': []}
    ret_json_mask = {'masks': [], 'sequences': []}
    ret_json_keypoint = {'keypoints': [], 'sequences': []}
    print('to json ...')

    for loc, classnames, ann_type, seq in tqdm(zip(location_list, classname_list, type_list, valid_sequences)):
        if len(loc) != len(classnames):
            continue

        # Choose the destination dictionary and count the instances.
        if ann_type == 'box':
            target, key = ret_json_box, 'bboxes'
            n_instances = loc.shape[0]
        elif ann_type in ('cocokeypoint', 'crowdpose'):
            target, key = ret_json_keypoint, 'keypoints'
            n_instances = loc.shape[0]
        elif ann_type == 'mask':
            target, key = ret_json_mask, 'masks'
            n_instances = len(loc)
        else:
            raise NotImplementedError

        # One single-key dictionary per instance: class name -> nested list of numbers.
        instances = [{classnames[i]: loc[i].tolist()} for i in range(n_instances)]

        if instances:
            target[key].append(instances)
            target['sequences'].append([seq])

    return [ret_json_box, ret_json_mask, ret_json_keypoint]


def gen_cond_mask(texts, ctn):
    """Parse sequences, render them without saving, and build the JSON output.

    Args:
        texts: Forwarded to ``to_coordinate`` as its first argument.
        ctn: Forwarded to ``to_coordinate`` as its ``ctn`` argument.

    Returns:
        ``(canvas, json_parts)``: the image returned by ``visualization`` and
        the list returned by ``to_json``.
    """
    locations, names, kinds, sequences = to_coordinate(texts, ctn)
    canvas = visualization(locations, names, kinds, save_dir=None, save_fig=False)
    json_parts = to_json(locations, names, kinds, sequences)
    return canvas, json_parts

if __name__ == '__main__':

    def _str2bool(value):
        """Parse a command-line boolean such as ``true``/``false``, ``1``/``0``, ``yes``/``no``."""
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y'):
            return True
        if lowered in ('', '0', 'false', 'f', 'no', 'n'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = argparse.ArgumentParser()
    parser.add_argument('--file_path', type=str, required=True)
    parser.add_argument('--save_dir', type=str, default='debug')
    # type=bool would turn any non-empty string (including "False") into True.
    # Accept both a bare `--visualize` and an explicit `--visualize true|false`.
    parser.add_argument('--visualize', type=_str2bool, nargs='?', const=True, default=False)
    args = parser.parse_args()

    location_list, classname_list, type_list, valid_sequences = to_coordinate(args.file_path)

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # visualization
    if args.visualize:
        # save_fig must be enabled, otherwise the rendered canvases are discarded.
        visualization(location_list, classname_list, type_list, args.save_dir, save_fig=True)

    # to json data
    rets = to_json(location_list, classname_list, type_list, valid_sequences)

    # The JSON files go to ./files, which may not exist yet.
    os.makedirs('files', exist_ok=True)
    stem = os.path.basename(args.file_path).split('.')[0]
    for ret, flag in zip(rets, ['box', 'mask', 'keypoint']):
        with open(os.path.join('files', f'{stem}_{flag}.json'), 'w') as file:
            json.dump(ret, file, indent=2)