CogVLM

Running

File size: 3,523 Bytes

6accf0d

import seaborn as sns
from PIL import Image, ImageDraw, ImageFont
import matplotlib.font_manager
import spacy
import re

# spaCy pipeline loaded once at import time; text_to_dict() uses its noun
# chunks to find the phrase preceding each box annotation.
# NOTE(review): the name looks like a version-pinned local model
# package/directory rather than the plain "en_core_web_sm" alias -- confirm
# it resolves in the deployment environment.
nlp = spacy.load("en_core_web_sm-3.6.0")

def draw_boxes(image, boxes, texts, output_fn='output.png'):
    """Draw labelled bounding boxes over ``image`` and save the result.

    The input image is not modified: boxes and labels are painted on a
    transparent overlay which is alpha-composited onto an RGBA copy, and the
    composite is written to ``output_fn`` as RGB.

    Args:
        image: PIL image to annotate.
        boxes: Sequence of box groups, one group per label. Every box is
            indexable as ``(x0, y0, x1, y1)`` with coordinates expressed as
            fractions of the image width/height.
        texts: Labels parallel to ``boxes``. A label may span several lines;
            a falsy label draws the rectangles only.
        output_fn: Destination path of the annotated image.
    """
    box_width = 5
    # One distinct hue per box group, converted from 0-1 floats to 0-255 ints.
    color_palette = sns.color_palette("husl", len(boxes))
    colors = [(int(r * 255), int(g * 255), int(b * 255)) for r, g, b in color_palette]

    width, height = image.size
    # Scale fractional coordinates to pixel coordinates.
    absolute_boxes = [
        [
            (int(box[0] * width), int(box[1] * height), int(box[2] * width), int(box[3] * height))
            for box in group
        ]
        for group in boxes
    ]

    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)

    # Use the first system TTF font (sorted for determinism). Hosts without
    # any TTF font, or with an unreadable first entry, used to crash here;
    # fall back to Pillow's built-in font instead.
    font_paths = sorted(matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf'))
    try:
        font = ImageFont.truetype(font_paths[0], size=26)
    except (IndexError, OSError):
        font = ImageFont.load_default()

    for group, text, color in zip(absolute_boxes, texts, colors):
        # The label is the same for every box in the group: split it once.
        label_lines = text.split('\n') if text else []
        for b in group:
            draw.rectangle(b, outline=color, width=box_width)
            if not label_lines:
                continue
            # getbbox() returns (left, top, right, bottom); the last two
            # values are used as the rendered width and height.
            first_line_height = font.getbbox(label_lines[0])[-1]
            # Default: stack the label inside the box, just above its bottom edge.
            y_start = b[3] - first_line_height * len(label_lines) - box_width
            # Boxes narrower or shorter than 100px get the label below the box.
            if b[2] - b[0] < 100 or b[3] - b[1] < 100:
                y_start = b[3]
            for i, line in enumerate(label_lines):
                text_width, text_height = font.getbbox(line)[-2:]
                x = b[0] + box_width
                y = y_start + text_height * i
                # Semi-transparent grey backdrop keeps white text readable.
                draw.rectangle([x, y, x + text_width, y + text_height], fill=(128, 128, 128, 160))
                draw.text((x, y), line, font=font, fill=(255, 255, 255))

    img_with_overlay = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
    img_with_overlay.save(output_fn)

def boxstr_to_boxes(box_str):
    """Parse a ``;``-separated list of comma-separated integer boxes.

    Each coordinate is divided by 1000, so ``"100,200,300,400"`` becomes
    ``[0.1, 0.2, 0.3, 0.4]``.

    Segments that are not made up solely of non-empty decimal integer fields
    (empty segments, stray or trailing commas, whitespace, signs, letters) are
    skipped instead of raising ``ValueError``.

    Args:
        box_str: Text such as ``"x0,y0,x1,y1;x0,y0,x1,y1"``.

    Returns:
        A list with one list of floats per well-formed segment, in input order.
    """
    boxes = []
    for segment in box_str.split(';'):
        fields = segment.split(',')
        # Validate every field on its own: checking the comma-stripped segment
        # as a whole would accept "1,,2" or "1,2," and then crash in int('').
        # isdecimal() matches exactly the characters int() can convert.
        if all(field.isdecimal() for field in fields):
            boxes.append([int(field) / 1000 for field in fields])
    return boxes

def text_to_dict(text):
    """Map each phrase preceding a ``[[...]]`` annotation to its boxes.

    For every ``[[...]]`` group in ``text`` the label is the text running from
    the start of the last noun chunk that ends at or before the group up to
    the group itself. If that label ends with ``?``, the whole text before the
    group is used instead. Labels are lower-cased.

    When the same label occurs more than once, the boxes of all occurrences
    are collected under that label rather than the last occurrence replacing
    the earlier ones.

    Args:
        text: Model response containing ``[[x0,y0,x1,y1;...]]`` annotations.

    Returns:
        Dict mapping label to a list of boxes as produced by
        ``boxstr_to_boxes``, in order of first appearance of each label.
    """
    doc = nlp(text)
    # Materialise the chunk spans once instead of re-walking the parse for
    # every annotation.
    chunk_spans = [(chunk.start_char, chunk.end_char) for chunk in doc.noun_chunks]

    phrase_to_boxes = {}
    for match in re.finditer(r'\[\[([^\]]+)\]\]', text):
        box_position = match.start()
        # Start of the last noun chunk that ends before the annotation; fall
        # back to the beginning of the text when there is none.
        nearest_np_start = max(
            (start for start, end in chunk_spans if end <= box_position),
            default=0,
        )
        noun_phrase = text[nearest_np_start:box_position].strip()
        if noun_phrase.endswith('?'):
            noun_phrase = text[:box_position].strip()

        # Extend rather than assign so repeated labels keep all their boxes.
        phrase_to_boxes.setdefault(noun_phrase.lower(), []).extend(
            boxstr_to_boxes(match.group(1))
        )
    return phrase_to_boxes

def parse_response(img, response, output_fn='output.png'):
    """Render the boxes described in ``response`` onto ``img`` and save it.

    The image is converted to RGB and rescaled, preserving aspect ratio, to
    the largest size that fits within 1920x1080 (smaller images are scaled
    up). Labels and boxes are extracted from ``response`` with
    ``text_to_dict`` and drawn with ``draw_boxes``.

    Args:
        img: PIL image the response refers to.
        response: Model output containing ``[[...]]`` box annotations.
        output_fn: Destination path of the annotated image.
    """
    img = img.convert('RGB')
    width, height = img.size
    ratio = min(1920 / width, 1080 / height)
    new_size = (int(width * ratio), int(height * ratio))
    new_img = img.resize(new_size, Image.LANCZOS)

    # The former ad-hoc regex/int() parsing here was removed: its result was
    # always overwritten below, yet it could raise ValueError on malformed
    # box strings before anything was drawn.
    dic = text_to_dict(response)
    if dic:
        texts, boxes = zip(*dic.items())
    else:
        texts, boxes = [], []
    draw_boxes(new_img, boxes, texts, output_fn=output_fn)