import json
import os
import tempfile
from dataclasses import asdict
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from safetensors.torch import load_model, save_model

from models.vision_transformer import ViT
from models.language_model import LanguageModel
from models.modality_projector import ModalityProjector
from models.config import VLMConfig


class VisionLanguageModel(nn.Module):
    def __init__(self, cfg: VLMConfig, load_backbone=True):
        super().__init__()
        self.cfg = cfg
        if load_backbone:
            # Initialize the vision encoder and language decoder from pretrained backbone weights
            print("Loading from backbone weights")
            self.vision_encoder = ViT.from_pretrained(cfg)
            self.decoder = LanguageModel.from_pretrained(cfg)
        else:
            # Build the backbones from the config alone; weights can be loaded later (e.g. in from_pretrained)
            self.vision_encoder = ViT(cfg)
            self.decoder = LanguageModel(cfg)
        # The modality projector maps vision embeddings into the language model's embedding space
        self.MP = ModalityProjector(cfg)
        self.load_backbone = load_backbone

    def forward(self, input_ids, image, attention_mask=None, targets=None):
        image_embd = self.vision_encoder(image)
        image_embd = self.MP(image_embd)

        token_embd = self.decoder.token_embedding(input_ids)

        # Prepend the projected image embeddings to the token embeddings
        combined_embd = torch.cat((image_embd, token_embd), dim=1)

        # Extend the attention mask so that all image positions are attended to
        if attention_mask is not None:
            batch_size = image_embd.size(0)
            img_seq_len = image_embd.size(1)
            image_attention_mask = torch.ones((batch_size, img_seq_len), device=attention_mask.device, dtype=attention_mask.dtype)
            attention_mask = torch.cat((image_attention_mask, attention_mask), dim=1)

        # The decoder returns hidden states here; they are only projected to vocabulary logits when a loss is needed
        logits = self.decoder(combined_embd, attention_mask)

        loss = None
        if targets is not None:
            # Project to vocabulary logits and drop the image positions before computing the loss
            logits = self.decoder.head(logits)
            logits = logits[:, image_embd.size(1):, :]
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=-100)

        return logits, loss

    @torch.no_grad()
    def generate(self, input_ids, image, attention_mask=None, max_new_tokens=5):
        # Embed the image and project it into the language model's embedding space
        image_embd = self.vision_encoder(image)
        image_embd = self.MP(image_embd)

        token_embd = self.decoder.token_embedding(input_ids)

        combined_embd = torch.cat((image_embd, token_embd), dim=1)

        batch_size = image_embd.size(0)
        img_seq_len = image_embd.size(1)

        # Extend the attention mask so that all image positions are attended to
        if attention_mask is not None:
            image_attention_mask = torch.ones((batch_size, img_seq_len), device=attention_mask.device, dtype=attention_mask.dtype)
            attention_mask = torch.cat((image_attention_mask, attention_mask), dim=1)

        # Autoregressively sample max_new_tokens tokens, feeding each one back in as an embedding
        outputs = combined_embd
        generated_tokens = torch.zeros((batch_size, max_new_tokens), device=input_ids.device, dtype=input_ids.dtype)

        for i in range(max_new_tokens):
            model_out = self.decoder(outputs, attention_mask)

            last_token_logits = model_out[:, -1, :]

            # If the decoder returns hidden states, project the last position to vocabulary logits
            if not self.decoder.lm_use_tokens:
                last_token_logits = self.decoder.head(last_token_logits)

            probs = torch.softmax(last_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            generated_tokens[:, i] = next_token.squeeze(-1)

            # Append the new token's embedding (and attention) and continue decoding
            next_embd = self.decoder.token_embedding(next_token)
            outputs = torch.cat((outputs, next_embd), dim=1)

            if attention_mask is not None:
                attention_mask = torch.cat((attention_mask, torch.ones((batch_size, 1), device=attention_mask.device, dtype=attention_mask.dtype)), dim=1)

        return generated_tokens

    @classmethod
    def from_pretrained(
        cls, repo_id_or_path: str, *, revision: Optional[str] = None
    ) -> "VisionLanguageModel":
        """
        Load a VisionLanguageModel from a local directory or a repo on the Hugging Face Hub.

        Args:
            repo_id_or_path (str): The path to the local directory or the Hugging Face Hub repo ID.
            revision (str, optional): The revision (branch, tag, or commit) to download from the Hub.

        Returns:
            VisionLanguageModel: The loaded model.
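
        Example (a sketch; assumes the directory or repo contains config.json and model.safetensors):
            model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-222M")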
        """
        # Resolve the config and weight paths either locally or by downloading from the Hub
        if os.path.exists(repo_id_or_path):
            config_path = os.path.join(repo_id_or_path, "config.json")
            weights_path = os.path.join(repo_id_or_path, "model.safetensors")

            if not os.path.exists(config_path):
                raise ValueError(
                    f"Config file not found at {config_path}. Please provide a valid path."
                )
            if not os.path.exists(weights_path):
                raise ValueError(
                    f"Weights file not found at {weights_path}. Please provide a valid path."
                )
        else:
            from huggingface_hub import hf_hub_download

            config_path = hf_hub_download(
                repo_id=repo_id_or_path, filename="config.json", revision=revision
            )
            weights_path = hf_hub_download(
                repo_id=repo_id_or_path, filename="model.safetensors", revision=revision
            )

        # Load the config and build the model without downloading the backbone weights
        with open(config_path, "r") as f:
            cfg = VLMConfig(**json.load(f))

        model = cls(cfg, load_backbone=False)

        # Load the full set of weights (vision encoder, projector, and decoder) from the safetensors file
        load_model(model, weights_path)

        return model

    def save_pretrained(self, save_directory: str) -> None:
        """
        Save the model and configuration to a directory.

        Args:
            save_directory (str): The directory to save the model and config.
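
        Example (a sketch; "checkpoints/nanovlm" is a placeholder directory):
            model.save_pretrained("checkpoints/nanovlm")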
        """
        os.makedirs(save_directory, exist_ok=True)

        # Save the config as JSON alongside the weights
        with open(os.path.join(save_directory, "config.json"), "w") as f:
            f.write(json.dumps(asdict(self.cfg), indent=4))

        # Save all model weights in safetensors format
        save_model(self, os.path.join(save_directory, "model.safetensors"))

    def push_to_hub(self, repo_id: str, private: bool = False) -> None:
        """
        Push the model and configuration to the Hugging Face Hub.

        Args:
            repo_id (str): The repo ID on the Hugging Face Hub.
            private (bool): Whether to create the repo as private. Defaults to False.
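
        Example (a sketch; "your-username/my-nanovlm" is a placeholder repo ID):
            model.push_to_hub("your-username/my-nanovlm", private=True)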
        """
        from huggingface_hub import create_repo, upload_folder

        # Create the repo (no-op if it already exists) and normalize the repo ID
        repo_url = create_repo(repo_id=repo_id, private=private, exist_ok=True)
        repo_id = repo_url.repo_id
        print("Created repo: ", repo_url)

        with tempfile.TemporaryDirectory() as save_path:
            # Save the model weights and config to a temporary directory
            self.save_pretrained(save_path)

            # Write a minimal model card
            with open(os.path.join(save_path, "README.md"), "w") as f:
                f.write(MODEL_CARD_TEMPLATE.format(repo_id=repo_id))

            return upload_folder(
                repo_id=repo_id,
                repo_type="model",
                folder_path=save_path,
                commit_message="Upload nanoVLM using push_to_hub",
            )


MODEL_CARD_TEMPLATE = """
---
# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
# Doc / guide: https://huggingface.co/docs/hub/model-cards
library_name: nanovlm
license: mit
pipeline_tag: image-text-to-text
tags:
  - vision-language
  - multimodal
  - research
---

**nanoVLM** is a minimal and lightweight Vision-Language Model (VLM) designed for efficient training and experimentation. Built using pure PyTorch, the entire model architecture and training logic fits within ~750 lines of code. It combines a ViT-based image encoder (SigLIP-B/16-224-85M) with a lightweight causal language model (SmolLM2-135M), resulting in a compact 222M-parameter model.

For more information, check out the base model at https://huggingface.co/lusxvr/nanoVLM-222M.

**Usage:**

Clone the nanoVLM repository: https://github.com/huggingface/nanoVLM.
Follow the install instructions and run the following code:

```python
from models.vision_language_model import VisionLanguageModel

model = VisionLanguageModel.from_pretrained("{repo_id}")
```
"""
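

# Minimal usage sketch (not part of the library API). It shows how the class is meant to be
# called: a model with random weights, a dummy image, and a dummy prompt. It assumes VLMConfig
# can be constructed with default values, and the field names used for the image size and
# vocabulary size below (vit_img_size, lm_vocab_size) are assumptions that may need to be
# adjusted to the actual config.
if __name__ == "__main__":
    cfg = VLMConfig()
    model = VisionLanguageModel(cfg, load_backbone=False)  # build with random weights, no downloads

    batch_size = 1
    image = torch.randn(batch_size, 3, cfg.vit_img_size, cfg.vit_img_size)  # dummy image batch
    input_ids = torch.randint(0, cfg.lm_vocab_size, (batch_size, 8))        # dummy prompt tokens
    attention_mask = torch.ones_like(input_ids)

    # forward returns (hidden states or logits, loss); loss is None when no targets are given
    out, loss = model(input_ids, image, attention_mask=attention_mask)
    tokens = model.generate(input_ids, image, attention_mask=attention_mask, max_new_tokens=5)
    print(out.shape, loss, tokens.shape)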