UncleFish committed
Commit 6c8221b
Parent: 9c3357b

update readme

Files changed (1)
  1. README.md +11 -56
README.md CHANGED
@@ -56,8 +56,8 @@ More technical details will come with a technical report soon.
 
 ```python
 from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
-import json
-import PIL
+import requests
+from PIL import Image
 import IPython.display as display
 import torch
 model_name_or_path = "Salesforce/xgen-mm-phi3-mini-base-r-v1"
@@ -81,14 +81,13 @@ def apply_prompt_template(prompt, num_images=1, num_tokens_per_vis = 128, in_con
     return formatted_prompt
 
 ############ Zero shot inference ##########
-with open('./test_samples/zero_shot.json') as f:
-    sample = json.load(f)
-instruction = sample['instruction']
-img = PIL.Image.open(sample['image_path'])
+img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
+raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
+instruction = "Describe what is the dog doing in this image in one sentence:"
 print("==> Instruction: ", instruction)
 print("==> Image: ")
-display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-inputs = image_processor([img], return_tensors="pt")
+display.display(raw_image.resize((int(raw_image.width*0.3), int(raw_image.height*0.3))))
+inputs = image_processor([raw_image], return_tensors="pt")
 prompt = apply_prompt_template(instruction)
 language_inputs = tokenizer([prompt], return_tensors="pt")
 inputs.update(language_inputs)
@@ -97,59 +96,15 @@ inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
     generated_text = model.generate(**inputs,
                                     pad_token_id=tokenizer.pad_token_id,
-                                    do_sample=False, max_new_tokens=256, top_p=None, num_beams=1,
-                                    length_penalty=1.0, repetition_penalty=2.0)
+                                    do_sample=False, max_new_tokens=64, top_p=None, num_beams=1,
+                                    length_penalty=1.0, repetition_penalty=3.0)
 prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
 print("==> prediciton: ", prediction)
 print("-"*120)
-# ==> prediciton: A man sits on a bench in front of the Red Corner Cafe.
-
-############ Few shots inference ##########
-# prepare in-context examples
-with open('./test_samples/few_shots.json') as f:
-    incontext_data = json.load(f)
-print(f'In-context learning with {len(incontext_data)} examples.')
-context_images, context_text = [], ""
-for example in incontext_data:
-    print("-"*40 + f" {example} " + "-"*40)
-    img = PIL.Image.open(incontext_data[example]['image_path'])
-    instruction = incontext_data[example]['instruction']
-    example_text = apply_prompt_template(prompt=instruction, in_context=True, output=incontext_data[example]['output'])
-    context_images.append(img)
-    context_text += (example_text)
-    print("==> Instruction: ", instruction)
-    print("==> Image: ")
-    display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-    print("==> Output: ", incontext_data[example]['output'])
-# prepare test example
-with open('./test_samples/zero_shot.json') as f:
-    sample = json.load(f)
-instruction = "A short description of this image in one sentence:"
-print("-"*40 + " Prediction " + "-"*40)
-img = PIL.Image.open(sample['image_path'])
-print("==> Instruction: ", instruction)
-print("==> Image: ")
-display.display(img.resize((int(img.width*0.3), int(img.height*0.3))))
-prompt = apply_prompt_template(instruction)
-batch_images = context_images + [img]
-batch_text = context_text + prompt
-# prepare inputs
-inputs = image_processor(batch_images, return_tensors="pt")
-language_inputs = tokenizer([batch_text], return_tensors="pt")
-inputs.update(language_inputs)
-inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
-with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-    generated_text = model.generate(**inputs,
-                                    pad_token_id=tokenizer.pad_token_id,
-                                    do_sample=False, max_new_tokens=256, top_p=None, num_beams=1,
-                                    length_penalty=1.0)
-prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
-print("==> prediciton: ", prediction)
-print("-"*120)
-# ==> prediciton: A man sitting on a bench in front of a red building.
+# ==> prediciton: The dog is sitting on the beach and waving at his own
 ```
 
-More comprehensive examples can be found in the [notebook](demo.ipynb).
+More comprehensive examples can be found in the [notebook](demo.ipynb), where we provide a zero-shot and a few-shot example, respectively.
 
 # Reproducibility:
 
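Taken together, the additions in this commit amount to a single zero-shot captioning flow. The sketch below assembles it end to end for readability. Note the assumptions: the `from_pretrained` loading calls and the body of `apply_prompt_template` are defined earlier in the README and are not part of this diff, so the versions shown here (the `trust_remote_code=True` flags and the pass-through placeholder template) are illustrative stand-ins, not the model card's exact code.

```python
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
from PIL import Image
import requests
import torch

model_name_or_path = "Salesforce/xgen-mm-phi3-mini-base-r-v1"

# Assumption: standard from_pretrained loading; the README's actual loading lines
# are outside this diff and may use different arguments.
model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)

def apply_prompt_template(prompt):
    # Placeholder for the template defined earlier in the README (not shown in this
    # diff); it simply returns the instruction unchanged here.
    return prompt

# Fetch the demo image and state the instruction, as in the updated snippet.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
instruction = "Describe what is the dog doing in this image in one sentence:"

# Preprocess image and text, then merge both into one input dict on the GPU.
inputs = image_processor([raw_image], return_tensors="pt")
language_inputs = tokenizer([apply_prompt_template(instruction)], return_tensors="pt")
inputs.update(language_inputs)
inputs = {name: tensor.cuda() for name, tensor in inputs.items()}

# Greedy decoding under bfloat16 autocast with the generation settings from the diff.
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
    generated_text = model.generate(**inputs,
                                    pad_token_id=tokenizer.pad_token_id,
                                    do_sample=False, max_new_tokens=64, top_p=None, num_beams=1,
                                    length_penalty=1.0, repetition_penalty=3.0)
prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(prediction)
```

With the README's actual loading code and prompt template in place, the output should correspond to the caption shown in the diff's final `# ==> prediciton:` comment.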