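# Creates a tiny randomly-initialized VOPT model (backed by a tiny CLIP vision tower) and its
# tokenizer, runs a quick generation sanity check, and saves everything under ./tiny-random-vopt-clip
# so the folder can be uploaded to the hub for use in tests.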
from pathlib import Path
from types import SimpleNamespace

import torchvision.transforms as transforms
from PIL import Image

from m4.models.vopt.modeling_vopt import VOPTConfig, VOPTForCausalLM
from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
from m4.training.utils import get_tokenizer
|
mname_tiny = "tiny-random-vopt-clip"

path = Path(mname_tiny)
path.mkdir(parents=True, exist_ok=True)
|
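# two extra embedding slots for the added tokens (<fake_token_around_image> and <image>)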
additional_vocab_size = 2
|
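# shrink every dimension so the checkpoint stays tiny while still exercising the resampler and the vision tower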
config = VOPTConfig()
config.update(
    dict(
        ffn_dim=64,
        hidden_size=16,
        max_position_embeddings=128,
        num_attention_heads=4,
        num_hidden_layers=2,
        word_embed_proj_dim=16,
        max_new_tokens=100,
        use_resampler=True,
        resampler_depth=2,
        resampler_head_dim=8,
        resampler_n_heads=2,
        resampler_n_latents=16,
        vision_embed_dim=32,
        vision_image_size=30,
        vision_model_name="hf-internal-testing/tiny-random-clip",
        vision_model_params="{}",
        vocab_size=50265,
        additional_vocab_size=additional_vocab_size,
    )
)
|
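# instantiate the model from the tiny config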
model = VOPTForCausalLM.from_config(config)
|
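# tokenizer setup: the OPT tokenizer extended with the two extra image tokens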
tokenizer_config = dict(
    tokenizer_add_special_tokens="{}",
    tokenizer_add_tokens=(
        '[AddedToken("<fake_token_around_image>", rstrip=False, lstrip=False), AddedToken("<image>", rstrip=False,'
        " lstrip=False)]"
    ),
    tokenizer_name="facebook/opt-13b",
    tokenizer_params='{"use_fast":True}',
)
tokenizer_config = SimpleNamespace(**tokenizer_config)
|
tokenizer = get_tokenizer(
    tokenizer_name=tokenizer_config.tokenizer_name,
    tokenizer_add_tokens=tokenizer_config.tokenizer_add_tokens,
    tokenizer_add_special_tokens=tokenizer_config.tokenizer_add_special_tokens,
    tokenizer_params=tokenizer_config.tokenizer_params,
    additional_vocab_size=model.config.additional_vocab_size,
    model_vocab_size=model.config.vocab_size,
)
assert "<image>" in tokenizer.get_vocab()
|
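# sanity check: build a single packed example with one image and a short caption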
query = "<fake_token_around_image><image><fake_token_around_image>This is a picture of a cat."
query_tokens = tokenizer(query, return_tensors="pt")
|
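# dummy pixel values: one blank 30x30 RGB image, shaped (batch, num_images, channels, height, width);
# the image attention mask ties each text position to the image(s) it may attend to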
num_images_per_ex = 1
pixel_values = transforms.ToTensor()(Image.new("RGB", (30, 30))).repeat(1, 1, 1, 1).unsqueeze(0)
image_attention_mask, _ = image_attention_mask_for_packed_input_ids(query_tokens["input_ids"], tokenizer)
image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=num_images_per_ex)
|
inputs = {
    "input_ids": query_tokens["input_ids"],
    "attention_mask": query_tokens["attention_mask"],
    "pixel_values": pixel_values,
    "image_attention_mask": image_attention_mask,
}
|
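# smoke-test generation before saving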
out_gen = model.generate(**inputs)
text = tokenizer.batch_decode(out_gen)
|
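# halve the weights to keep the saved checkpoint small, then save model and tokenizer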
model.half()
model.save_pretrained(path)
tokenizer.save_pretrained(path)
|
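# make sure the saved checkpoint can be reloaded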
model = VOPTForCausalLM.from_pretrained(path)
|
print(f"Generated {mname_tiny} - Upload the generated folder to the hub") |