---
license: apache-2.0
datasets:
- google/docci
- google/imageinwords
language:
- en
library_name: transformers
pipeline_tag: image-text-to-text
tags:
- art
---

A fine-tuned version of PaliGemma 224x224, trained on the [google/docci](https://huggingface.co/datasets/google/docci) and [google/imageinwords](https://huggingface.co/datasets/google/imageinwords) datasets.

```bash
pip install git+https://github.com/huggingface/transformers
```

```python
# Example: caption a single image with the fine-tuned PaliGemma checkpoint.
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import requests
import torch

model_id = "gokaygokay/sd3-long-captioner-v2"

# Fetch the sample image. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors directly
# instead of letting PIL fail later on an error page.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Load the checkpoint in eval mode together with its matching processor.
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
processor = AutoProcessor.from_pretrained(model_id)

# Text prefix passed to the model alongside the image.
prompt = "caption en"
model_inputs = processor(text=prompt, images=image, return_tensors="pt")
# Length of the encoded input; used below to drop those leading tokens from
# the generated sequence so only the new tokens are decoded.
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(
        **model_inputs,
        repetition_penalty=1.10,
        max_new_tokens=256,
        do_sample=False,
    )
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)
```