#!/usr/bin/env python

# This script creates a super tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to check the quality of the outcomes.
#
# usage: adjust the config values below if needed, otherwise just run the script

from pathlib import Path
from types import SimpleNamespace

import torchvision.transforms as transforms
from PIL import Image

from m4.models.vllama.modeling_vllama import VLlamaConfig, VLlamaForCausalLM
from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
from m4.training.utils import get_tokenizer


mname_tiny = "tiny-random-vllama-clip"

path = Path(mname_tiny)
path.mkdir(parents=True, exist_ok=True)

# from the hardcoded https://github.com/huggingface/m4/blob/adf102f0000cb2632cd8a3ebb87398c65e448a97/m4/training/main.py#L80
additional_vocab_size = 2

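# start from the default config and shrink every dimension so the resulting checkpoint stays tiny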
config = VLlamaConfig()
config.update(
    dict(
        ffn_dim=64,
        hidden_size=16,
        max_position_embeddings=128,
        num_attention_heads=4,
        num_hidden_layers=2,
        word_embed_proj_dim=16,
        max_new_tokens=100,
        use_resampler=True,
        resampler_depth=2,
        resampler_head_dim=8,
        resampler_n_heads=2,
        resampler_n_latents=16,
        vision_embed_dim=32,
        vision_image_size=30,
        vision_model_name="hf-internal-testing/tiny-random-clip",
        vision_model_params="{}",
        vocab_size=32000,
        additional_vocab_size=additional_vocab_size,
    )
)

# print(config)
# the config now carries the tiny values set above

model = VLlamaForCausalLM.from_config(config)
# print(model.config)
# print(model)

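# tokenizer setup mirroring training: add the <fake_token_around_image> and <image> placeholder tokens used below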
tokenizer_config = dict(
    tokenizer_add_special_tokens="{}",
    tokenizer_add_tokens=(
        '[AddedToken("<fake_token_around_image>", rstrip=False, lstrip=False), AddedToken("<image>", rstrip=False,'
        " lstrip=False)]"
    ),
    tokenizer_name="HuggingFaceM4/huggy-llama-tokenizer-7b",
    tokenizer_params='{"use_fast": True}',
)
tokenizer_config = SimpleNamespace(**tokenizer_config)
# print(tokenizer_config)

tokenizer = get_tokenizer(
    tokenizer_name=tokenizer_config.tokenizer_name,
    tokenizer_add_tokens=tokenizer_config.tokenizer_add_tokens,
    tokenizer_add_special_tokens=tokenizer_config.tokenizer_add_special_tokens,
    tokenizer_params=tokenizer_config.tokenizer_params,
    additional_vocab_size=model.config.additional_vocab_size,
    model_vocab_size=model.config.vocab_size,
)
assert "<image>" in tokenizer.get_vocab()

# Test w/ one image and one text
query = "<fake_token_around_image><image><fake_token_around_image>This is a picture of a cat."
query_tokens = tokenizer(query, return_tensors="pt")

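# a single black 30x30 RGB image, shaped [batch, num_images, channels, height, width]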
num_images_per_ex = 1
pixel_values = transforms.ToTensor()(Image.new("RGB", (30, 30))).repeat(1, 1, 1, 1).unsqueeze(0)
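# build the incremental image-attention mask from the input ids, then binarize it to one channel per image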
image_attention_mask, _ = image_attention_mask_for_packed_input_ids(query_tokens["input_ids"], tokenizer)
image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=num_images_per_ex)

inputs = {
    "input_ids": query_tokens["input_ids"],
    "attention_mask": query_tokens["attention_mask"],
    "pixel_values": pixel_values,
    "image_attention_mask": image_attention_mask,
}
# debug shapes
# print(query_tokens["input_ids"].shape)
# print(query_tokens["attention_mask"].shape)
# print(pixel_values.shape)
# print(image_attention_mask.shape)

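# smoke test: run an end-to-end generation with the tiny model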
out_gen = model.generate(**inputs)
text = tokenizer.batch_decode(out_gen)
# print(text)

# Save model + config + tokenizer
model.half()  # makes it smaller
model.save_pretrained(path)
tokenizer.save_pretrained(path)

# test we can load it back
model = VLlamaForCausalLM.from_pretrained(path)

print(f"Generated {mname_tiny} - Upload the generated folder to the hub")