Image-Text-to-Text
Transformers
Safetensors
English
florence2
text-generation
art
custom_code
Edit model card
# Shell command, not Python: run it in a terminal before executing the code below.
pip install -q datasets flash_attn timm einops
from transformers import AutoModelForCausalLM, AutoProcessor, AutoConfig
import torch

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# trust_remote_code=True is passed because the checkpoint is loaded with its own
# custom code from the model repository.
model = AutoModelForCausalLM.from_pretrained(
    "gokaygokay/Florence-2-SD3-Captioner",
    trust_remote_code=True,
)
model = model.to(device).eval()
processor = AutoProcessor.from_pretrained(
    "gokaygokay/Florence-2-SD3-Captioner",
    trust_remote_code=True,
)

# Function to run the model on an example
def run_example(task_prompt, text_input, image):
    """Generate and post-process model output for one image.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        task_prompt: Task token placed at the start of the prompt (for
            example ``"<DESCRIPTION>"``); it is also passed as ``task`` to
            the processor's post-processing step.
        text_input: Text appended directly after ``task_prompt``.
        image: Image object exposing ``mode``, ``convert``, ``width`` and
            ``height``; it is converted to RGB when its mode is different.

    Returns:
        The value returned by ``processor.post_process_generation``.
    """
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")

    encoded = processor(
        text=task_prompt + text_input,
        images=rgb_image,
        return_tensors="pt",
    ).to(device)

    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        pixel_values=encoded["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text handed to post-processing.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)
    size = (rgb_image.width, rgb_image.height)
    return processor.post_process_generation(decoded[0], task=task_prompt, image_size=size)

from PIL import Image
import requests
import copy

# Download the sample image and run the captioner on it.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
# Bound the wait and fail on an HTTP error status, so a stalled connection or
# an error page is reported as such instead of being passed to PIL as image data.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
run_example("<DESCRIPTION>", "Describe this image in great detail.", image)

# {'<DESCRIPTION>': 'Captured at eye-level on a sunny day, a light blue Volkswagen Beetle is parked on a cobblestone street. The beetle is parked in front of a yellow building with two brown doors. The door on the right side of the frame is white, while the left side is a darker shade of blue. The car is facing the camera, and the car is positioned in the middle of the street.'}
Downloads last month
1,010
Safetensors
Model size
271M params
Tensor type
F32
·
Inference API (serverless) does not yet support model repos that contain custom code.

Datasets used to train gokaygokay/Florence-2-SD3-Captioner

Spaces using gokaygokay/Florence-2-SD3-Captioner 12