UncleFish committed
Commit 6c8221b
Parent: 9c3357b

update readme

Files changed (1)
  1. README.md +11 -56
README.md CHANGED
@@ -56,8 +56,8 @@ More technical details will come with a technical report soon.
 
 ```python
 from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
-import json
-import PIL
+import requests
+from PIL import Image
 import IPython.display as display
 import torch
 model_name_or_path = "Salesforce/xgen-mm-phi3-mini-base-r-v1"
@@ -81,14 +81,13 @@ def apply_prompt_template(prompt, num_images=1, num_tokens_per_vis = 128, in_con
     return formatted_prompt
 
 ############ Zero shot inference ##########
-with open('./test_samples/zero_shot.json') as f:
-    sample = json.load(f)
-instruction = sample['instruction']
-img = PIL.Image.open(sample['image_path'])
+img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
+raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
+instruction = "Describe what is the dog doing in this image in one sentence:"
 print("==> Instruction: ", instruction)
 print("==> Image: ")
-display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-inputs = image_processor([img], return_tensors="pt")
+display.display(raw_image.resize((int(raw_image.width*0.3), int(raw_image.height*0.3))))
+inputs = image_processor([raw_image], return_tensors="pt")
 prompt = apply_prompt_template(instruction)
 language_inputs = tokenizer([prompt], return_tensors="pt")
 inputs.update(language_inputs)
@@ -97,59 +96,15 @@ inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
     generated_text = model.generate(**inputs,
                                     pad_token_id=tokenizer.pad_token_id,
-                                    do_sample=False, max_new_tokens=256, top_p=None, num_beams=1,
-                                    length_penalty=1.0, repetition_penalty=2.0)
+                                    do_sample=False, max_new_tokens=64, top_p=None, num_beams=1,
+                                    length_penalty=1.0, repetition_penalty=3.0)
 prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
 print("==> prediciton: ", prediction)
 print("-"*120)
-# ==> prediciton: A man sits on a bench in front of the Red Corner Cafe.
-
-############ Few shots inference ##########
-# prepare in-context examples
-with open('./test_samples/few_shots.json') as f:
-    incontext_data = json.load(f)
-print(f'In-context learning with {len(incontext_data)} examples.')
-context_images, context_text = [], ""
-for example in incontext_data:
-    print("-"*40 + f" {example} " + "-"*40)
-    img = PIL.Image.open(incontext_data[example]['image_path'])
-    instruction = incontext_data[example]['instruction']
-    example_text = apply_prompt_template(prompt=instruction, in_context=True, output=incontext_data[example]['output'])
-    context_images.append(img)
-    context_text += (example_text)
-    print("==> Instruction: ", instruction)
-    print("==> Image: ")
-    display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-    print("==> Output: ", incontext_data[example]['output'])
-# prepare test example
-with open('./test_samples/zero_shot.json') as f:
-    sample = json.load(f)
-instruction = "A short description of this image in one sentence:"
-print("-"*40 + " Prediction " + "-"*40)
-img = PIL.Image.open(sample['image_path'])
-print("==> Instruction: ", instruction)
-print("==> Image: ")
-display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-prompt = apply_prompt_template(instruction)
-batch_images = context_images + [img]
-batch_text = context_text + prompt
-# prepare inputs
-inputs = image_processor(batch_images, return_tensors="pt")
-language_inputs = tokenizer([batch_text], return_tensors="pt")
-inputs.update(language_inputs)
-inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
-with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-    generated_text = model.generate(**inputs,
-                                    pad_token_id=tokenizer.pad_token_id,
-                                    do_sample=False, max_new_tokens=256, top_p=None, num_beams=1,
-                                    length_penalty=1.0)
-prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
-print("==> prediciton: ", prediction)
-print("-"*120)
-# ==> prediciton: A man sitting on a bench in front of a red building.
+# ==> prediciton: The dog is sitting on the beach and waving at his own
 ```
 
-More comprehensive examples can be found in the [notebook](demo.ipynb).
+More comprehensive examples can be found in the [notebook](demo.ipynb), where we provide a zero-shot and a few-shot example, respectively.
 
 # Reproducibility:
 
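Taken together, the additions in this commit amount to a single zero-shot captioning flow. The sketch below assembles it end to end for readability. Note the assumptions: the `from_pretrained` loading calls and the body of `apply_prompt_template` are defined earlier in the README and are not part of this diff, so the versions shown here (the `trust_remote_code=True` flags and the pass-through placeholder template) are illustrative stand-ins, not the model card's exact code.

```python
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
from PIL import Image
import requests
import torch

model_name_or_path = "Salesforce/xgen-mm-phi3-mini-base-r-v1"

# Assumption: standard from_pretrained loading; the README's actual loading lines
# are outside this diff and may use different arguments.
model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)

def apply_prompt_template(prompt):
    # Placeholder for the template defined earlier in the README (not shown in this
    # diff); it simply returns the instruction unchanged here.
    return prompt

# Fetch the demo image and state the instruction, as in the updated snippet.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
instruction = "Describe what is the dog doing in this image in one sentence:"

# Preprocess image and text, then merge both into one input dict on the GPU.
inputs = image_processor([raw_image], return_tensors="pt")
language_inputs = tokenizer([apply_prompt_template(instruction)], return_tensors="pt")
inputs.update(language_inputs)
inputs = {name: tensor.cuda() for name, tensor in inputs.items()}

# Greedy decoding under bfloat16 autocast with the generation settings from the diff.
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
    generated_text = model.generate(**inputs,
                                    pad_token_id=tokenizer.pad_token_id,
                                    do_sample=False, max_new_tokens=64, top_p=None, num_beams=1,
                                    length_penalty=1.0, repetition_penalty=3.0)
prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(prediction)
```

With the README's actual loading code and prompt template in place, the output should correspond to the caption shown in the diff's final `# ==> prediciton:` comment.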