update readme
README.md CHANGED

````diff
@@ -56,8 +56,8 @@ More technical details will come with a technical report soon.
 
 ```python
 from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
-import json
-import PIL
+import requests
+from PIL import Image
 import IPython.display as display
 import torch
 model_name_or_path = "Salesforce/xgen-mm-phi3-mini-base-r-v1"
@@ -81,14 +81,13 @@ def apply_prompt_template(prompt, num_images=1, num_tokens_per_vis = 128, in_context=False, output=None):
     return formatted_prompt
 
 ############ Zero shot inference ##########
-with open('./test_samples/zero_shot.json') as f:
-    sample = json.load(f)
-instruction = ...
-img = PIL.Image.open(sample['image_path'])
+img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
+raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
+instruction = "Describe what is the dog doing in this image in one sentence:"
 print("==> Instruction: ", instruction)
 print("==> Image: ")
-display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-inputs = image_processor([img], return_tensors="pt")
+display.display(raw_image.resize((int(raw_image.width*0.3), int(raw_image.height*0.3))))
+inputs = image_processor([raw_image], return_tensors="pt")
 prompt = apply_prompt_template(instruction)
 language_inputs = tokenizer([prompt], return_tensors="pt")
 inputs.update(language_inputs)
@@ -97,59 +96,15 @@ inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
     generated_text = model.generate(**inputs,
                                     pad_token_id=tokenizer.pad_token_id,
-                                    do_sample=False, max_new_tokens=..., top_p=None, num_beams=1,
-                                    length_penalty=1.0, repetition_penalty=...)
+                                    do_sample=False, max_new_tokens=64, top_p=None, num_beams=1,
+                                    length_penalty=1.0, repetition_penalty=3.0)
 prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
 print("==> prediction: ", prediction)
 print("-"*120)
-# ==> prediction: ...
-
-############ Few shots inference ##########
-# prepare in-context examples
-with open('./test_samples/few_shots.json') as f:
-    incontext_data = json.load(f)
-print(f'In-context learning with {len(incontext_data)} examples.')
-context_images, context_text = [], ""
-for example in incontext_data:
-    print("-"*40 + f" {example} " + "-"*40)
-    img = PIL.Image.open(incontext_data[example]['image_path'])
-    instruction = incontext_data[example]['instruction']
-    example_text = apply_prompt_template(prompt=instruction, in_context=True, output=incontext_data[example]['output'])
-    context_images.append(img)
-    context_text += (example_text)
-    print("==> Instruction: ", instruction)
-    print("==> Image: ")
-    display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-    print("==> Output: ", incontext_data[example]['output'])
-# prepare test example
-with open('./test_samples/zero_shot.json') as f:
-    sample = json.load(f)
-instruction = "A short description of this image in one sentence:"
-print("-"*40 + " Prediction " + "-"*40)
-img = PIL.Image.open(sample['image_path'])
-print("==> Instruction: ", instruction)
-print("==> Image: ")
-display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-prompt = apply_prompt_template(instruction)
-batch_images = context_images + [img]
-batch_text = context_text + prompt
-# prepare inputs
-inputs = image_processor(batch_images, return_tensors="pt")
-language_inputs = tokenizer([batch_text], return_tensors="pt")
-inputs.update(language_inputs)
-inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
-with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-    generated_text = model.generate(**inputs,
-                                    pad_token_id=tokenizer.pad_token_id,
-                                    do_sample=False, max_new_tokens=256, top_p=None, num_beams=1,
-                                    length_penalty=1.0)
-prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
-print("==> prediction: ", prediction)
-print("-"*120)
-# ==> prediction: A man sitting on a bench in front of a red building.
+# ==> prediction: The dog is sitting on the beach and waving at his own
 ```
 
-More comprehensive examples can be found in the [notebook](demo.ipynb).
+More comprehensive examples can be found in the [notebook](demo.ipynb), where we provide a zero-shot and a few-shot example, respectively.
 
 # Reproducibility:
 
````
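The snippet calls `model`, `tokenizer`, and `image_processor`, but the loading step sits above the first hunk and is not shown. Assuming the usual Hugging Face pattern for checkpoints that ship custom modeling code, the setup presumably looks like the sketch below; the exact keyword arguments are assumptions, not part of this commit:

```python
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor

model_name_or_path = "Salesforce/xgen-mm-phi3-mini-base-r-v1"

# Assumed loading step (not shown in this diff): checkpoints with custom
# modeling code typically need trust_remote_code=True for all three parts.
model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
model = model.cuda()  # the generation code moves inputs to CUDA, so the model must live there too
```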
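The second hunk header shows only the signature of `apply_prompt_template`; its body lies outside the diff. As a rough illustration of what such a helper could do, here is a minimal sketch: the `<image>` placeholder string, the repetition count, and the concatenation order are assumptions for illustration, not the repository's actual prompt format.

```python
def apply_prompt_template(prompt, num_images=1, num_tokens_per_vis=128, in_context=False, output=None):
    """Hypothetical sketch: prepend per-image placeholder tokens to the text prompt.

    The placeholder token and layout are assumed; the real definition lives
    in the README above this diff.
    """
    placeholder = "<image>" * (num_tokens_per_vis * num_images)
    formatted_prompt = placeholder + prompt
    if in_context and output is not None:
        # Few-shot examples carry their expected output so the model sees
        # complete (instruction, answer) pairs before the test query.
        formatted_prompt += " " + output + "\n"
    return formatted_prompt
```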
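One note on the generation block: `torch.cuda.amp.autocast(dtype=...)` still works, but recent PyTorch releases deprecate it in favor of the device-agnostic `torch.amp.autocast`. If that warning shows up, an equivalent call (reusing `model`, `inputs`, and `tokenizer` from the snippet above) would be:

```python
import torch

# torch.amp.autocast takes the device type explicitly; this mirrors
# torch.cuda.amp.autocast(dtype=torch.bfloat16) in the snippet above.
with torch.amp.autocast("cuda", dtype=torch.bfloat16):
    generated_text = model.generate(**inputs,
                                    pad_token_id=tokenizer.pad_token_id,
                                    do_sample=False, max_new_tokens=64, top_p=None, num_beams=1,
                                    length_penalty=1.0, repetition_penalty=3.0)
```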