Point output issues with Molmo-72B

#19
by pltrtt - opened

First of all, thank you for sharing this amazing model with the public.

After playing with the models for a while, I've found that Molmo-7B-D-0924 works as expected, but the bigger model, Molmo-72B-0924, struggles to output accurate points on my end. I am running on 8 A100 40GB GPUs. The points output by Molmo-72B sometimes look good and land on top of the objects, but at other times they are totally random. Here are some examples:

| Prompt | Molmo-7B-D | Molmo-72B |
| --- | --- | --- |
| Point out airplanes in this image | airplane-1.jpg | airplane-1_72b.jpg |
| Point out cars in this image | car_molmo-7B_0.png | car_molmo-72B_0.png |
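
For reference, a Molmo pointing response is an XML-like tag whose coordinates are normalized to a 0-100 range. The exact string below is made up, but it follows the format that the regex in the script expects:

```
<points x1="24.5" y1="61.0" x2="70.2" y2="58.4" alt="cars">cars</points>
```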

Here is a minimal version of the code to reproduce the issue:

```python
from vllm import LLM, SamplingParams
from PIL import Image
import re
import numpy as np
import cv2
from urllib.request import urlopen

llm = LLM(model='allenai/Molmo-7B-D-0924', 
          trust_remote_code=True, 
          tensor_parallel_size=4)

# llm = LLM(model='allenai/Molmo-72B-0924', 
#           trust_remote_code=True, 
#           tensor_parallel_size=8)

# Note: default sampling is stochastic; setting temperature=0 makes runs
# deterministic and easier to compare while debugging
sampling_params = SamplingParams(max_tokens=1000)

# Refer to the HuggingFace repo for the correct format to use
prompt = "Point out all cars in the image"

img_urls = [
    "https://www.shutterstock.com/shutterstock/videos/1110804299/thumb/1.jpg"
]
def extract_points(molmo_output, image_w, image_h):
    # Parse x/y attribute pairs from Molmo's <point>/<points> tags
    all_points = []
    for match in re.finditer(r'x\d*="\s*([0-9]+(?:\.[0-9]+)?)"\s+y\d*="\s*([0-9]+(?:\.[0-9]+)?)"', molmo_output):
        try:
            point = [float(match.group(i)) for i in range(1, 3)]
        except ValueError:
            pass
        else:
            point = np.array(point)
            if np.max(point) > 100:
                # Treat as an invalid output
                continue
            # Molmo coordinates are normalized to a 0-100 range
            point /= 100.0
            point = point * np.array([image_w, image_h])
            point = point.astype(np.int32)
            all_points.append(point)
    return all_points

for idx, url in enumerate(img_urls):

    # Load the image using img_url
    image_pil = Image.open(urlopen(url))
    img_rgb = np.array(image_pil)

    # Single-prompt inference; take the generated text from the first (only) output
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image_pil},
    }, sampling_params=sampling_params)

    generated_text = outputs[0].outputs[0].text
    print(generated_text)

    points = extract_points(generated_text, img_rgb.shape[1], img_rgb.shape[0])
    print(points)
    # Draw the points on the image
    for point in points:
        # cv2 expects the center as a tuple of ints
        cv2.circle(img_rgb, tuple(point), 5, (0, 255, 0), -1)
        
    # Write the answer on the top left corner
    cv2.putText(img_rgb, generated_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # Save the image with points 
    img_path = f'car_molmo-7B_{idx}.png'
    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

    cv2.imwrite(img_path, img_bgr)
```
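
As a quick sanity check of the parsing alone, `extract_points` can be run on a hand-written output string (the coordinates below are made up):

```python
sample = '<points x1="25.0" y1="50.0" x2="75.5" y2="40.2" alt="cars">cars</points>'
print(extract_points(sample, 1920, 1080))
# [array([480, 540], dtype=int32), array([1449, 434], dtype=int32)]
```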

Could you help me examine my code to see if anything should be adjusted to run inference with the 72B version properly? Any feedback is much appreciated.
