Update README.md
README.md CHANGED

@@ -10,6 +10,7 @@ datasets:
 - ModalityDance/Omni-Bench
 base_model:
 - GAIR/Anole-7b-v0.1
+pipeline_tag: any-to-any
 ---

 # Omni-R1
@@ -30,7 +31,7 @@ from PIL import Image
 from transformers import ChameleonProcessor, ChameleonForConditionalGeneration

 # 1) Import & load
-model_id = "ModalityDance/Omni-R1" # or
+model_id = "ModalityDance/Omni-R1" # or "ModalityDance/Omni-R1-Zero"
 processor = ChameleonProcessor.from_pretrained(model_id)
 model = ChameleonForConditionalGeneration.from_pretrained(
     model_id,
@@ -39,7 +40,7 @@ model = ChameleonForConditionalGeneration.from_pretrained(
 )
 model.eval()

-# 2) Prepare a single input
+# 2) Prepare a single input (prompt contains <image>)
 prompt = "What is the smiling man in the image wearing? <image>"
 image = Image.open("image.png").convert("RGB")

@@ -51,9 +52,20 @@ inputs = processor(
     return_tensors="pt",
 ).to(model.device)

+# --- minimal image token preprocessing: replace <image> placeholder with image tokens ---
+input_ids = inputs["input_ids"].long()
+pixel_values = inputs["pixel_values"]
+
+placeholder_id = processor.tokenizer.encode("<image>", add_special_tokens=False)[0]
+image_tokens = model.get_image_tokens(pixel_values)  # shape: [1, N] (or compatible)
+
+mask = (input_ids == placeholder_id)
+input_ids = input_ids.clone()
+input_ids[mask] = image_tokens.reshape(-1).to(dtype=torch.long, device=input_ids.device)
+
 # 3) Call the model
 outputs = model.generate(
-
+    input_ids=input_ids,
     max_length=4096,
     do_sample=True,
     temperature=0.5,
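Note that the added preprocessing lines reference `torch.long`, so the README snippet needs `import torch` near its other imports if it is not already there. To make the masked assignment easier to follow, here is a minimal, self-contained sketch (not part of this commit; all token ids are made up for illustration) of how positions equal to the placeholder id are overwritten, in order, by the image token ids:

```python
import torch

# Hypothetical ids, for illustration only: 99 stands in for the <image> placeholder id.
input_ids = torch.tensor([[5, 7, 99, 99, 99, 8]])
image_tokens = torch.tensor([[101, 102, 103]])  # one image token per placeholder position

mask = (input_ids == 99)                 # True at the three placeholder positions
fused = input_ids.clone()
fused[mask] = image_tokens.reshape(-1)   # scatter the image token ids into those positions
print(fused)                             # tensor([[  5,   7, 101, 102, 103,   8]])
```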
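The diff stops at the `generate` call. If you want to inspect the result as text, one common pattern (an assumption here, not something this commit adds) is to decode the returned ids with the processor:

```python
# Assumes `outputs` holds the token ids returned by model.generate(...) above.
text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print(text)
```

This is enough for text-only answers; interleaved image tokens would instead need to be decoded back to pixels with the model's VQ image tokenizer, which this snippet does not cover.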