BleachNick committed on
Commit
ab9a3c5
·
1 Parent(s): 1a96366

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +30 -13
README.md CHANGED
@@ -7,8 +7,15 @@ library_name: transformers
7
 
8
  # Model Card for MMICL
9
 
 
 
 
 
 
 
 
10
  ## Temporal Demo for MMICL
11
- [Playground for MMICL-FLANT5XXL](https://bcd7bc41d42486e7c8.gradio.live/)
12
  Supports multi-image input as well as video input.
13
  <!-- Provide a quick summary of what the model is/does. -->
14
 
@@ -54,29 +61,33 @@ import transformers
54
  from PIL import Image
55
  import torch
56
  model_type="instructblip"
57
- model_ckpt="BleachNick/MMICL-Instructblip-T5-xxl"
58
- config_ckpt = "Salesforce/instructblip-flan-t5-xxl"
59
- config = InstructBlipConfig.from_pretrained(config_ckpt )
60
 
61
  if 'instructblip' in model_type:
62
  model = InstructBlipForConditionalGeneration.from_pretrained(
63
  model_ckpt,
64
  config=config).to('cuda:0',dtype=torch.bfloat16)
65
 
66
-
67
- sp = ["图"]+[f"<image{i}>" for i in range(20)]
68
-
69
  processor = InstructBlipProcessor.from_pretrained(
70
- model_ckpt
71
  )
72
-
73
-
74
  sp = sp+processor.tokenizer.additional_special_tokens[len(sp):]
75
  processor.tokenizer.add_special_tokens({'additional_special_tokens':sp})
 
 
 
 
76
 
 
 
 
 
77
 
78
- prompt = ['Use the image 0: <image0>图,image 1: <image1>图 and image 2: <image2>图 as a visual aid to help you calculate the equation accurately. image 0 is 2+1=3.\nimage 1 is 5+6=11.\nimage 2 is"']
79
- # images try to load the images to be a list of PIL.Image object.
80
  prompt = " ".join(prompt)
81
 
82
  inputs = processor(images=images, text=prompt, return_tensors="pt")
@@ -90,10 +101,16 @@ outputs = model.generate(
90
  pixel_values = inputs['pixel_values'],
91
  input_ids = inputs['input_ids'],
92
  attention_mask = inputs['attention_mask'],
93
- img_mask = inputs['img_mask']
 
 
 
 
94
  )
95
  generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
96
  print(generated_text)
 
 
97
 
98
  ```
99
 
 
7
 
8
  # Model Card for MMICL
9
 
10
+ # News 🚀
11
+ 1. [09-19] We have converted the MMICL demo to a permanent link: [Demo for MMICL](http://www.testmmicl.work). The Vicuna version of MMICL and Chat Mode are presently under development, so they may require careful adjustment of generation parameters and may not work correctly.
12
+ 2. [09-15] Our [paper](https://arxiv.org/abs/2309.07915) has been uploaded to arXiv.
13
+ 3. [09-01] The [MIC](https://huggingface.co/datasets/BleachNick/MIC_full) data has been released on the Hugging Face Hub.
14
+ 4. [08-23] Reached 1st place on [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) and 1st place on [MMBench](https://opencompass.org.cn/leaderboard-multimodal).
15
+ 5. [08-21] The [MMICL-FLANT5XXL](https://huggingface.co/BleachNick/MMICL-Instructblip-T5-xxl) and [MMICL-Tiny](https://huggingface.co/BleachNick/MMICL-Instructblip-T5-xl) models have been released on the Hugging Face Hub.
16
+
17
  ## Temporal Demo for MMICL
18
+ [Playground for MMICL-FLANT5XXL](http://www.testmmicl.work/)
19
  Supports multi-image input as well as video input.
20
  <!-- Provide a quick summary of what the model is/does. -->
21
 
 
61
  from PIL import Image
62
  import torch
63
  model_type="instructblip"
64
+ model_ckpt="/home/haozhezhao/MMICL-Instructblip-T5-xxl"
65
+ processor_ckpt = "Salesforce/instructblip-flan-t5-xxl"
66
+ config = InstructBlipConfig.from_pretrained(model_ckpt )
67
 
68
  if 'instructblip' in model_type:
69
  model = InstructBlipForConditionalGeneration.from_pretrained(
70
  model_ckpt,
71
  config=config).to('cuda:0',dtype=torch.bfloat16)
72
 
73
+ image_palceholder="图"
74
+ sp = [image_palceholder]+[f"<image{i}>" for i in range(20)]
 
75
  processor = InstructBlipProcessor.from_pretrained(
76
+ processor_ckpt
77
  )
 
 
78
  sp = sp+processor.tokenizer.additional_special_tokens[len(sp):]
79
  processor.tokenizer.add_special_tokens({'additional_special_tokens':sp})
80
+ if model.qformer.embeddings.word_embeddings.weight.shape[0] != len(processor.qformer_tokenizer):
81
+ model.qformer.resize_token_embeddings(len(processor.qformer_tokenizer))
82
+ replace_token="".join(32*[image_palceholder])
83
+
84
 
85
+ image = Image.open ("images/cal_num1.png")
86
+ image1 = Image.open ("images/cal_num2.png")
87
+ image2 = Image.open ("images/cal_num3.png")
88
+ images = [image,image1,image2]
89
 
90
+ prompt = [f'Use the image 0: <image0>{replace_token},image 1: <image1>{replace_token} and image 2: <image2>{replace_token} as a visual aid to help you calculate the equation accurately. image 0 is 2+1=3.\nimage 1 is 5+6=11.\nimage 2 is"']
 
91
  prompt = " ".join(prompt)
92
 
93
  inputs = processor(images=images, text=prompt, return_tensors="pt")
 
101
  pixel_values = inputs['pixel_values'],
102
  input_ids = inputs['input_ids'],
103
  attention_mask = inputs['attention_mask'],
104
+ img_mask = inputs['img_mask'],
105
+ do_sample=False,
106
+ max_length=50,
107
+ min_length=1,
108
+ set_min_padding_size =False,
109
  )
110
  generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
111
  print(generated_text)
112
+ # output: 3x6=18"
113
+
114
 
115
  ```
116