File size: 1,802 Bytes
11b81b8
 
 
 
 
 
 
 
b4627e7
4119334
 
11b81b8
41e5269
cc56bee
d17454c
cc56bee
41e5269
cc56bee
41e5269
cc56bee
 
41e5269
cc56bee
41e5269
cc56bee
 
41e5269
cc56bee
 
 
41e5269
cc56bee
 
 
41e5269
cc56bee
 
 
 
 
7f67ff1
 
cc56bee
 
 
 
41e5269
cc56bee
 
 
41e5269
cc56bee
 
28cede3
41e5269
cc56bee
 
74edd87
cc56bee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
---
license: apache-2.0
language:
- en
library_name: transformers
pipeline_tag: image-text-to-text
tags:
- art
base_model: microsoft/Florence-2-base
datasets:
- kadirnar/fluxdev_controlnet_16k
---

```
pip install -q torch==2.4.0 datasets flash_attn timm einops
```

```python

from transformers import AutoModelForCausalLM, AutoProcessor, AutoConfig
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCausalLM.from_pretrained("gokaygokay/Florence-2-Flux", trust_remote_code=True).to(device).eval()
processor = AutoProcessor.from_pretrained("gokaygokay/Florence-2-Flux", trust_remote_code=True)

# Function to run the model on an example
def run_example(task_prompt, text_input, image):
    prompt = task_prompt + text_input

    # Ensure the image is in RGB mode
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
        repetition_penalty=1.10,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
    return parsed_answer

from PIL import Image
import requests
import copy

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)
answer = run_example("<DESCRIPTION>", "Describe this image in great detail.", image)

final_answer = answer["<DESCRIPTION>"]
print(final_answer)
  
```