Point output issues with Molmo-72B
- opened
First of all, thank you for sharing this amazing model to the public.
After playing with the model for a while, Molmo-7B-D-0924 works as expected, but the bigger model, Molmo-72B-0924, struggles to output accurate points on my end. I am using 8 A100 40GB GPUs. The points output from Molmo-72B sometimes look good, but they can be either on top of the objects or totally random. Here are some examples:
Here is a minimal version of the code to reproduce the error:
from vllm import LLM, SamplingParams
import PIL
import re
import numpy as np
import cv2
from urllib.request import urlopen
import io
llm = LLM(model='allenai/Molmo-7B-D-0924',
# llm = LLM(model='allenai/Molmo-72B-0924',
# trust_remote_code=True,
# tensor_parallel_size=8)
sampling_params = SamplingParams(max_tokens=1000)
# Refer to the HuggingFace repo for the correct format to use
prompt = "Point out all cars in the image"
img_urls = [
def extract_points(molmo_output, image_w, image_h):
    """Parse point annotations from Molmo's text output into pixel coordinates.

    The regular expression looks for attribute pairs of the form
    ``x="12.3" y="45.6"`` (optionally numbered, e.g. ``x1="..." y1="..."``).
    Each value is interpreted as a percentage of the image size and scaled
    to pixels.

    Args:
        molmo_output: Text generated by the model.
        image_w: Image width in pixels.
        image_h: Image height in pixels.

    Returns:
        A list of ``np.int32`` arrays ``[x, y]`` in pixel coordinates, one per
        valid point found, in the order they appear in the text. Empty when
        no valid point is found.
    """
    all_points = []
    for match in re.finditer(r'x\d*="\s*([0-9]+(?:\.[0-9]+)?)"\s+y\d*="\s*([0-9]+(?:\.[0-9]+)?)"', molmo_output):
        try:
            point = [float(match.group(i)) for i in range(1, 3)]
        except ValueError:
            # Skip a pair that cannot be parsed as numbers instead of aborting.
            continue
        point = np.array(point)
        if np.max(point) > 100:
            # Treat as an invalid output
            continue
        # Percentages -> fractions -> pixels.
        point /= 100.0
        point = point * np.array([image_w, image_h])
        # Truncate to integer pixel coordinates for drawing.
        point = point.astype(np.int32)
        all_points.append(point)
    return all_points
# A bare ``import PIL`` does not guarantee that the ``PIL.Image`` submodule is
# loaded, so import it explicitly before using ``PIL.Image.open``.
import PIL.Image

# For every image URL: download the image, ask the model to point at the
# objects, draw the returned points and the raw answer, and save the result.
for idx, url in enumerate(img_urls):
    # Load the image using img_url; the context manager closes the HTTP
    # response once the bytes have been read.
    with urlopen(url) as response:
        image_pil = PIL.Image.open(io.BytesIO(response.read()))
    # Normalise to 3-channel RGB so the RGB->BGR conversion below also works
    # for RGBA, palette, or grayscale inputs.
    image_pil = image_pil.convert("RGB")
    img_rgb = np.array(image_pil)

    # Single prompt inference
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image_pil},
    }, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        # shape is (height, width, channels): pass width first, then height.
        points = extract_points(generated_text, img_rgb.shape[1], img_rgb.shape[0])

        # Draw the points on the image
        for point in points:
            # OpenCV expects the centre as a tuple of plain Python ints.
            center = (int(point[0]), int(point[1]))
            cv2.circle(img_rgb, center, 5, (0, 255, 0), -1)

        # Write the answer on the top left corner
        cv2.putText(img_rgb, generated_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # Save the image with points (OpenCV writes BGR, so convert first).
    img_path = f'car_molmo-7B_{idx}.png'
    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
    cv2.imwrite(img_path, img_bgr)
Could you help me examine my code to see if there is anything I should adjust to run inference with the 72B version? Any feedback is much appreciated.