import torch.nn as nn
from transformers import LlamaTokenizer

from .open_flamingo import (
    create_model_and_transforms as create_open_flamingo_model_and_transforms,
)

def create_model_and_transforms(
    model_name: str,
    clip_vision_encoder_path: str,
    clip_vision_encoder_pretrained: str,
    lang_encoder_path: str,
    tokenizer_path: str,
    tuning_config,
    pretrained_model_path,
    **kwargs,
):
    """Build a (model, image_processor, tokenizer) triple for the given model family."""
    if model_name == "open_flamingo":
        return create_open_flamingo_model_and_transforms(
            clip_vision_encoder_path=clip_vision_encoder_path,
            clip_vision_encoder_pretrained=clip_vision_encoder_pretrained,
            lang_encoder_path=lang_encoder_path,
            tokenizer_path=tokenizer_path,
            tuning_config=tuning_config,
            pretrained_model_path=pretrained_model_path,
            **kwargs,
        )
    # TODO: support BLIP2
    raise ValueError(f"Unknown model name: {model_name}")
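
# A minimal usage sketch (the encoder names and paths below are illustrative
# placeholders, not bundled assets):
#
#   model, image_processor, tokenizer = create_model_and_transforms(
#       model_name="open_flamingo",
#       clip_vision_encoder_path="ViT-L-14",
#       clip_vision_encoder_pretrained="openai",
#       lang_encoder_path="path/to/llama-7b",
#       tokenizer_path="path/to/llama-7b",
#       tuning_config=None,
#       pretrained_model_path="path/to/checkpoint.pt",
#   )
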
# Only for debugging: a stand-in pipeline that avoids loading real checkpoints.
def create_toy_model_and_transforms(
    model_name: str,
    clip_vision_encoder_path: str,
    clip_vision_encoder_pretrained: str,
    lang_encoder_path: str,
    tokenizer_path: str,
    tuning_config,
    pretrained_model_path,
    **kwargs,
):
    print("init toy vision encoder")
    import torchvision

    # Stand-in for the CLIP preprocessing: resize to 224x224, convert to tensor.
    image_processor = torchvision.transforms.Compose(
        [
            torchvision.transforms.Resize((224, 224)),
            torchvision.transforms.ToTensor(),
        ]
    )
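    # The toy processor maps a PIL image to a (3, 224, 224) float tensor, e.g.
    # (illustrative; "example.jpg" is a placeholder):
    #   from PIL import Image
    #   pixels = image_processor(Image.open("example.jpg").convert("RGB"))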
print("init tokenizer") | |
text_tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path) | |
# add Flamingo special tokens to the tokenizer | |
text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]}) | |
if text_tokenizer.pad_token is None: | |
# Issue: GPT models don't have a pad token, which we use to | |
# modify labels for the loss. | |
text_tokenizer.add_special_tokens({"pad_token": "<PAD>"}) | |

    class ToyModel(nn.Module):
        """Tiny language-model stand-in that returns a scalar loss like an HF model."""

        def __init__(self, *args, **kwargs):
            super().__init__()
            # 38000 comfortably covers the Llama vocabulary plus the added special tokens.
            self.input_embeddings = nn.Embedding(38000, 512)
            self.layer = nn.Linear(512, 512)
            self.config = {"hidden_size": 512}

        def forward(self, lang_x, **kwargs):
            x = self.input_embeddings(lang_x)
            x = self.layer(x)
            loss = x.sum()
            # Mimic the (loss, ...) tuple that HF causal-LM forward passes return.
            return (loss,)

    model = ToyModel()
    return model, image_processor, text_tokenizer
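
# Smoke-test sketch for the toy pipeline (comment-only because it still needs
# real Llama tokenizer files at `tokenizer_path`; token ids are arbitrary):
#
#   import torch
#   model, image_processor, tokenizer = create_toy_model_and_transforms(
#       model_name="toy", clip_vision_encoder_path="", clip_vision_encoder_pretrained="",
#       lang_encoder_path="", tokenizer_path="path/to/llama-7b",
#       tuning_config=None, pretrained_model_path=None,
#   )
#   (loss,) = model(torch.randint(0, 38000, (2, 16)))
#   loss.backward()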