Tags: Any-to-Any · Transformers · Safetensors · chameleon · image-to-text · multimodal · reasoning · sft · rl
charlesdj committed · Commit ef1738f · verified · 1 Parent(s): 13e998e

Update README.md

Files changed (1): README.md (+15 -3)
README.md CHANGED
```diff
@@ -10,6 +10,7 @@ datasets:
 - ModalityDance/Omni-Bench
 base_model:
 - GAIR/Anole-7b-v0.1
+pipeline_tag: any-to-any
 ---
 
 # Omni-R1
@@ -30,7 +31,7 @@ from PIL import Image
 from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
 
 # 1) Import & load
-model_id = "ModalityDance/Omni-R1" # or a local checkpoint path
+model_id = "ModalityDance/Omni-R1" # or "ModalityDance/Omni-R1-Zero"
 processor = ChameleonProcessor.from_pretrained(model_id)
 model = ChameleonForConditionalGeneration.from_pretrained(
     model_id,
@@ -39,7 +40,7 @@ model = ChameleonForConditionalGeneration.from_pretrained(
 )
 model.eval()
 
-# 2) Prepare a single input
+# 2) Prepare a single input (prompt contains <image>)
 prompt = "What is the smiling man in the image wearing? <image>"
 image = Image.open("image.png").convert("RGB")
 
@@ -51,9 +52,20 @@ inputs = processor(
     return_tensors="pt",
 ).to(model.device)
 
+# --- minimal image token preprocessing: replace <image> placeholder with image tokens ---
+input_ids = inputs["input_ids"].long()
+pixel_values = inputs["pixel_values"]
+
+placeholder_id = processor.tokenizer.encode("<image>", add_special_tokens=False)[0]
+image_tokens = model.get_image_tokens(pixel_values) # shape: [1, N] (or compatible)
+
+mask = (input_ids == placeholder_id)
+input_ids = input_ids.clone()
+input_ids[mask] = image_tokens.reshape(-1).to(dtype=torch.long, device=input_ids.device)
+
 # 3) Call the model
 outputs = model.generate(
-    **inputs,
+    input_ids=input_ids,
     max_length=4096,
     do_sample=True,
     temperature=0.5,
```
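
For reference, here is the inference snippet as README.md reads after this commit, assembled into one runnable sketch. Anything the hunks do not show is an assumption and is marked as such in the comments: the extra `from_pretrained` kwargs (old lines 37-38 fall between the hunks), the `text=`/`images=` keywords in the processor call, the dtype cast on `pixel_values`, and the final decode step.

```python
import torch
from PIL import Image
from transformers import ChameleonProcessor, ChameleonForConditionalGeneration

# 1) Import & load
model_id = "ModalityDance/Omni-R1"  # or "ModalityDance/Omni-R1-Zero"
processor = ChameleonProcessor.from_pretrained(model_id)
model = ChameleonForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # assumed: the hunk elides these kwargs
    device_map="auto",           # assumed
)
model.eval()

# 2) Prepare a single input (prompt contains <image>)
prompt = "What is the smiling man in the image wearing? <image>"
image = Image.open("image.png").convert("RGB")

inputs = processor(
    text=prompt,    # assumed keywords: the hunk only shows return_tensors
    images=image,
    return_tensors="pt",
).to(model.device)

# --- minimal image token preprocessing: replace <image> placeholder with image tokens ---
input_ids = inputs["input_ids"].long()
pixel_values = inputs["pixel_values"].to(model.dtype)  # assumed dtype cast

placeholder_id = processor.tokenizer.encode("<image>", add_special_tokens=False)[0]
image_tokens = model.get_image_tokens(pixel_values)  # discrete VQ codebook ids

mask = input_ids == placeholder_id
input_ids = input_ids.clone()
input_ids[mask] = image_tokens.reshape(-1).to(dtype=torch.long, device=input_ids.device)

# 3) Call the model
outputs = model.generate(
    input_ids=input_ids,
    max_length=4096,
    do_sample=True,
    temperature=0.5,
)

# 4) Decode (not in the diff; standard processor usage for text output)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0])
```

Note that switching `generate(**inputs, ...)` to `generate(input_ids=input_ids, ...)` also stops forwarding `pixel_values` (the image is already injected as discrete ids) and `attention_mask` (all ones for a single unpadded sequence, so generation is unaffected).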
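
The scatter in the new preprocessing block silently assumes that the processor expands `<image>` into exactly as many placeholder positions as `get_image_tokens` returns ids. A minimal hardening sketch, using a hypothetical `splice_image_tokens` helper that is not part of the commit:

```python
import torch

def splice_image_tokens(input_ids: torch.Tensor,
                        image_tokens: torch.Tensor,
                        placeholder_id: int) -> torch.Tensor:
    """Replace every <image> placeholder id with a discrete image token id.

    Hypothetical helper (not part of the commit) mirroring the committed
    snippet, with the implicit shape assumption checked: the scatter is
    only valid when the number of placeholder positions equals the number
    of image tokens.
    """
    mask = input_ids == placeholder_id
    flat = image_tokens.reshape(-1).to(dtype=torch.long, device=input_ids.device)
    if mask.sum().item() != flat.numel():
        raise ValueError(
            f"{mask.sum().item()} placeholder positions but "
            f"{flat.numel()} image tokens"
        )
    out = input_ids.clone()
    out[mask] = flat
    return out

# Usage, given the tensors from the snippet above:
# input_ids = splice_image_tokens(inputs["input_ids"].long(), image_tokens, placeholder_id)
```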