metadata
language:
- en
Emu2-Chat
Paper | 🤗HF Demo | Demo | Project Page | GitHub
Model Weights
Inference (Huggingface Version)
Single GPU
# Single-GPU inference example: describe one image with Emu2-Chat.
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_image(url, timeout=30):
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: HTTP(S) location of the image file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so a
            bad URL is reported as such instead of as a PIL decoding failure.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw).convert('RGB')


tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")

# Load the weights in bfloat16, move the model to the GPU and switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    "BAAI/Emu2-Chat",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).to('cuda').eval()

# `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` must be equal to the number of input images.
query = '[<IMG_PLH>]Describe the image in details:'
image = load_image('https://github.com/baaivision/Emu/Emu2/examples/blue_black_1_top_left.jpg?raw=true')

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=[image]
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Cast the image tensor to the dtype the model was loaded with.
        image=inputs["image"].to(torch.bfloat16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
Interleaved image and text
# Single-GPU inference example: interleaved image/text few-shot prompt.
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_image(url, timeout=30):
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: HTTP(S) location of the image file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so a
            bad URL is reported as such instead of as a PIL decoding failure.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw).convert('RGB')


tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")

# Load the weights in bfloat16, move the model to the GPU and switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    "BAAI/Emu2-Chat",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).to('cuda').eval()

# `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` must be equal to the number of input images.
# Each of the first three placeholders is followed by the label of its image;
# the last placeholder is left unlabeled for the model to complete.
# The second label says "top right" to agree with its image file
# (yellow_white_2_top_right.jpg); it previously read "top left".
query = "[<IMG_PLH>][red, white, 3, bottom left].[<IMG_PLH>][yellow, white, 2, top right].[<IMG_PLH>][green, black, 4, bottom right][<IMG_PLH>]"

# Images are listed in the same order as their placeholders in `query`.
image_urls = [
    'https://github.com/baaivision/Emu/Emu2/examples/red_white_3_bottom_left.jpg?raw=true',
    'https://github.com/baaivision/Emu/Emu2/examples/yellow_white_2_top_right.jpg?raw=true',
    'https://github.com/baaivision/Emu/Emu2/examples/green_black_4_bottom_right.jpg?raw=true',
    'https://github.com/baaivision/Emu/Emu2/examples/blue_black_1_top_left.jpg?raw=true',
]
images = [load_image(url) for url in image_urls]

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=images
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Cast the image tensor to the dtype the model was loaded with.
        image=inputs["image"].to(torch.bfloat16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
Multi GPU
# Multi-GPU inference example: describe one image with Emu2-Chat split over two GPUs.
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch


def load_image(url, timeout=30):
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: HTTP(S) location of the image file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so a
            bad URL is reported as such instead of as a PIL decoding failure.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw).convert('RGB')


tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")

# Instantiate the model under accelerate's `init_empty_weights` context so a
# device map can be inferred before the checkpoint is loaded and dispatched.
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        "BAAI/Emu2-Chat",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True)

# Budget 38GiB on each of GPU 0 and GPU 1; modules of the listed classes are
# never split across devices.
device_map = infer_auto_device_map(
    model,
    max_memory={0: '38GiB', 1: '38GiB'},
    no_split_module_classes=['Block', 'LlamaDecoderLayer'])
# input and output logits should be on same device
device_map["model.decoder.lm.lm_head"] = 0

# Replace the path below with the local directory holding the HF-format checkpoint.
model = load_checkpoint_and_dispatch(
    model,
    'local/path/to/hf/version/Emu2-Chat/model',
    device_map=device_map).eval()

# `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` must be equal to the number of input images.
query = '[<IMG_PLH>]Describe the image in details:'
image = load_image('https://github.com/baaivision/Emu/Emu2/examples/blue_black_1_top_left.jpg?raw=true')

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=[image]
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Cast the image tensor to the dtype the model was loaded with.
        image=inputs["image"].to(torch.bfloat16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
Interleaved image and text
# Multi-GPU inference example: interleaved image/text few-shot prompt.
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch


def load_image(url, timeout=30):
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: HTTP(S) location of the image file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so a
            bad URL is reported as such instead of as a PIL decoding failure.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw).convert('RGB')


tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")

# Instantiate the model under accelerate's `init_empty_weights` context so a
# device map can be inferred before the checkpoint is loaded and dispatched.
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        "BAAI/Emu2-Chat",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True)

# Budget 38GiB on each of GPU 0 and GPU 1; modules of the listed classes are
# never split across devices.
device_map = infer_auto_device_map(
    model,
    max_memory={0: '38GiB', 1: '38GiB'},
    no_split_module_classes=['Block', 'LlamaDecoderLayer'])
# input and output logits should be on same device
device_map["model.decoder.lm.lm_head"] = 0

# Replace the path below with the local directory holding the HF-format checkpoint.
model = load_checkpoint_and_dispatch(
    model,
    'local/path/to/hf/version/Emu2-Chat/model',
    device_map=device_map).eval()

# `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` must be equal to the number of input images.
# Each of the first three placeholders is followed by the label of its image;
# the last placeholder is left unlabeled for the model to complete.
# The second label says "top right" to agree with its image file
# (yellow_white_2_top_right.jpg); it previously read "top left".
query = "[<IMG_PLH>][red, white, 3, bottom left].[<IMG_PLH>][yellow, white, 2, top right].[<IMG_PLH>][green, black, 4, bottom right][<IMG_PLH>]"

# Images are listed in the same order as their placeholders in `query`.
image_urls = [
    'https://github.com/baaivision/Emu/Emu2/examples/red_white_3_bottom_left.jpg?raw=true',
    'https://github.com/baaivision/Emu/Emu2/examples/yellow_white_2_top_right.jpg?raw=true',
    'https://github.com/baaivision/Emu/Emu2/examples/green_black_4_bottom_right.jpg?raw=true',
    'https://github.com/baaivision/Emu/Emu2/examples/blue_black_1_top_left.jpg?raw=true',
]
images = [load_image(url) for url in image_urls]

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=images
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Cast the image tensor to the dtype the model was loaded with.
        image=inputs["image"].to(torch.bfloat16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
Quantization
See the quantization guide in the 🤗 Transformers documentation.
# 4-bit quantized inference example: describe one image with Emu2-Chat.
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def load_image(url, timeout=30):
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: HTTP(S) location of the image file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so a
            bad URL is reported as such instead of as a PIL decoding failure.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw).convert('RGB')


tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")

# Pass the 4-bit settings through an explicit BitsAndBytesConfig rather than as
# loose `from_pretrained` keyword arguments.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(
    "BAAI/Emu2-Chat",
    quantization_config=quantization_config,
    trust_remote_code=True).eval()

# `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` must be equal to the number of input images.
query = '[<IMG_PLH>]Describe the image in details:'
image = load_image('https://github.com/baaivision/Emu/Emu2/examples/blue_black_1_top_left.jpg?raw=true')

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=[image]
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Must be torch.float16 to match `bnb_4bit_compute_dtype` above.
        image=inputs["image"].to(torch.float16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
Citation
If you find Emu2 useful for your research and applications, please consider starring this repository and citing:
@article{Emu2,
title={Generative Multimodal Models are In-Context Learners},
author={Quan Sun and Yufeng Cui and Xiaosong Zhang and Fan Zhang and Qiying Yu and Zhengxiong Luo and Yueze Wang and Yongming Rao and Jingjing Liu and Tiejun Huang and Xinlong Wang},
journal={arXiv preprint arXiv:2312.13286},
year={2023},
}