Usage Example
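
The snippet below loads the model with 4-bit quantization and asks it to describe an image fetched from a URL. It assumes transformers, bitsandbytes (needed for BitsAndBytesConfig), accelerate (needed for device_map="auto"), Pillow, and requests are installed.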

import requests
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig


def get_image_description(model, processor, image, initial_prompt='', max_new_tokens=70, *args, **kwargs):
    # Fall back to a generic prompt when none is supplied.
    initial_prompt = initial_prompt or "How would you describe the contents of this photo?"
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": initial_prompt}
        ]}
    ]
    input_text = processor.apply_chat_template(
        messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor.decode(output[0])


def load_model(model_id="belkhir-nacim/l32vision_instruct"):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit quantization
    )
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id, device_map="auto", quantization_config=bnb_config)
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor
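
# Optional (an untested variant, not part of the original example): BitsAndBytesConfig
# also accepts an NF4 quant type and a compute dtype, which can help quality and speed
# on GPUs with bfloat16 support:
#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16,  # requires `import torch`
#     )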


model, processor = load_model()

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
image = Image.open(requests.get(url, stream=True).raw)
result = get_image_description(
    model, processor, image, initial_prompt="Tell me what you see in the image. Use keywords to describe it.")
print(result)
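
Note that processor.decode(output[0]) returns the full sequence, including the chat template, the prompt, and special tokens. If you only want the model's answer, a minimal sketch of the last lines of get_image_description (reusing the same variable names as above) is:

    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Drop the prompt tokens and special tokens, keeping only the generated answer.
    prompt_length = inputs["input_ids"].shape[-1]
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True)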