Harmon: Harmonizing Visual Representations for Unified Multimodal Understanding and Generation
Size Wu, Wenwei Zhang, Lumin Xu, Sheng Jin, Zhonghua Wu, Qingyi Tao, Wentao Liu, Wei Li, Chen Change Loy
Introduction
Harmon is a novel unified framework for multimodal understanding and generation. Unlike existing state-of-the-art architectures that disentangle visual understanding and generation with different encoder models, Harmon harmonizes the visual representations of the two tasks via a shared MAR encoder. It achieves advanced performance on mainstream text-to-image generation benchmarks and exhibits competitive results on multimodal understanding tasks. This repo provides inference code to run Harmon for image understanding (image-to-text) and text-to-image generation, with two model variants: Harmon-0.5B and Harmon-1.5B.
Model Variant | LLM | MAR | Hugging Face Hub |
---|---|---|---|
Harmon-0.5B | Qwen2.5-0.5B-Instruct | MAR-Base | |
Harmon-1.5B | Qwen2.5-1.5B-Instruct | MAR-Huge | |
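Both variants load through `transformers` with `trust_remote_code=True`, since the modeling code ships with the checkpoint. Below is a minimal loading sketch using the Harmon-0.5B repo id that appears in the snippets later in this section; swap in the Harmon-1.5B Hub id to use the larger variant.

from transformers import AutoModel, AutoTokenizer

# Hub id of the 0.5B variant (used in the examples below); substitute the Harmon-1.5B id for the larger model.
repo_id = "wusize/Harmon-0_5B"
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).eval().cuda().bfloat16()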
Usage
Image-to-text Generation
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
from PIL import Image
import requests
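# ChatML-style prompt template matching the Qwen2.5-Instruct chat format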
PROMPT_TEMPLATE = dict(
SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
SUFFIX='<|im_end|>',
SUFFIX_AS_EOS=True,
SEP='\n',
STOP_WORDS=['<|im_end|>', '<|endoftext|>'])
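# Pad a PIL image to a square canvas filled with background_color, keeping the content centered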
def expand2square(pil_img, background_color):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result
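# Pad/resize the image to 512x512, encode it with the MAR encoder, and splice the visual features
# into the LLM input embeddings at the <image> token positions before autoregressive decoding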
@torch.no_grad()
def question_answer(question,
image,
model,
tokenizer,
max_new_tokens=512,
image_size=512
):
assert image_size == 512
image = expand2square(
image, (127, 127, 127))
image = image.resize(size=(image_size, image_size))
image = torch.from_numpy(np.array(image)).to(dtype=model.dtype, device=model.device)
image = rearrange(image, 'h w c -> c h w')[None]
image = 2 * (image / 255) - 1
prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
assert '<image>' in prompt
image_length = (image_size // 16) ** 2 + model.mar.buffer_size
prompt = prompt.replace('<image>', '<image>'*image_length)
input_ids = tokenizer.encode(
prompt, add_special_tokens=True, return_tensors='pt').cuda()
_, z_enc = model.extract_visual_feature(model.encode(image))
inputs_embeds = z_enc.new_zeros(*input_ids.shape, model.llm.config.hidden_size)
inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
inputs_embeds[input_ids != image_token_idx] = model.llm.get_input_embeddings()(
input_ids[input_ids != image_token_idx]
)
output = model.llm.generate(inputs_embeds=inputs_embeds,
use_cache=True,
do_sample=False,
max_new_tokens=max_new_tokens,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id
if tokenizer.pad_token_id is not None else
tokenizer.eos_token_id
)
return tokenizer.decode(output[0])
harmon_tokenizer = AutoTokenizer.from_pretrained("wusize/Harmon-0_5B",
trust_remote_code=True)
harmon_model = AutoModel.from_pretrained("wusize/Harmon-0_5B",
trust_remote_code=True).eval().cuda().bfloat16()
special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1
image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}")
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw).convert('RGB')
output_text = question_answer(question='Describe the image in detail.',
image=raw_image,
model=harmon_model,
tokenizer=harmon_tokenizer,
)
print(output_text)
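Running the snippet downloads an example COCO image and prints Harmon's detailed description of it.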
Text-to-image Generation
import os
import torch
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
from PIL import Image
PROMPT_TEMPLATE = dict(
SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
SUFFIX='<|im_end|>',
SUFFIX_AS_EOS=True,
SEP='\n',
STOP_WORDS=['<|im_end|>', '<|endoftext|>'])
GENERATION_TEMPLATE = "Generate an image: {text}"
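# Sample grid_size x grid_size images per prompt; when cfg_scale != 1.0, negative prompts are
# appended to the batch for classifier-free guidance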
@torch.no_grad()
def generate_images(prompts,
negative_prompt,
tokenizer,
model,
output,
grid_size=2, # will produce 2 x 2 images per prompt
num_steps=64, cfg_scale=3.0, temperature=1.0, image_size=512):
assert image_size == 512
m = n = image_size // 16
prompts = [
PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)
for prompt in prompts
] * (grid_size ** 2)
if cfg_scale != 1.0:
prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
inputs = tokenizer(
prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(model.device)
images = model.sample(**inputs, num_iter=num_steps, cfg=cfg_scale, cfg_schedule="constant",
temperature=temperature, progress=True, image_shape=(m, n))
images = rearrange(images, '(m n b) c h w -> b (m h) (n w) c', m=grid_size, n=grid_size)
images = torch.clamp(
127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
os.makedirs(output, exist_ok=True)
for idx, image in enumerate(images):
Image.fromarray(image).save(f"{output}/{idx:08d}.jpg")
harmon_tokenizer = AutoTokenizer.from_pretrained("wusize/Harmon-0_5B",
trust_remote_code=True)
harmon_model = AutoModel.from_pretrained("wusize/Harmon-0_5B",
trust_remote_code=True).cuda().bfloat16().eval()
texts = ['a dog on the left and a cat on the right.',
'a photo of a pink stop sign.']
pos_prompts = [GENERATION_TEMPLATE.format(text=text) for text in texts]
neg_prompt = 'Generate an image.' # for classifier-free guidance
generate_images(prompts=pos_prompts,
negative_prompt=neg_prompt,
tokenizer=harmon_tokenizer,
model=harmon_model,
output='output',)
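With the default grid_size=2, each prompt produces a 2 x 2 grid of samples tiled into a single 1024 x 1024 image, saved to the output directory as 00000000.jpg, 00000001.jpg, and so on.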
Citation
If you find Harmon useful for your research or applications, please cite our paper using the following BibTeX:
@misc{wu2025harmon,
title={Harmonizing Visual Representations for Unified Multimodal Understanding and Generation},
author={Size Wu and Wenwei Zhang and Lumin Xu and Sheng Jin and Zhonghua Wu and Qingyi Tao and Wentao Liu and Wei Li and Chen Change Loy},
year={2025},
eprint={2503.21979},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2503.21979},
}
License
This project is licensed under NTU S-Lab License 1.0.