VictorSanh committed
Commit dc6da18
1 Parent(s): fd388f6

update with newest transformers integration
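In short, app_dialogue.py drops the hand-rolled image preprocessing and prompt-string assembly in favour of the transformers-native Idefics2 class and the processor's chat template. A minimal sketch of the new inference path, assembled from the added lines below (checkpoint id, token env var and the pinned fork are taken from the diff; the image path and max_new_tokens are illustrative, and this API is only guaranteed on that fork):

import os
import torch
from PIL import Image
from transformers import AutoProcessor, Idefics2ForConditionalGeneration

DEVICE = torch.device("cuda")
PROCESSOR = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-tfrm-compatible", token=os.environ["HF_AUTH_TOKEN"])
MODEL = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-tfrm-compatible",  # the Space itself loads local weights from /fsx/m4/victor/idefics2-8b
    torch_dtype=torch.bfloat16,
    token=os.environ["HF_AUTH_TOKEN"],
).to(DEVICE)

# A conversation is a list of {"role", "content"} dicts; images go straight into the content list
# and apply_chat_template inserts the <image> tokens the old code used to build by hand.
messages = [{"role": "user", "content": [Image.open("example.jpg"), "Describe this image."]}]
inputs = PROCESSOR.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
generated_ids = MODEL.generate(**inputs, max_new_tokens=256)
print(PROCESSOR.batch_decode(generated_ids[:, inputs["input_ids"].size(-1):], skip_special_tokens=True)[0])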

app_dialogue.py CHANGED
@@ -6,53 +6,37 @@ import time
 import torch
 
 from threading import Thread
-from typing import List, Tuple
+from typing import List, Dict, Union
 from urllib.parse import urlparse
 from PIL import Image
 
 import gradio as gr
-from gradio_client.client import DEFAULT_TEMP_DIR
-from transformers import AutoProcessor, AutoModelForCausalLM, TextIteratorStreamer
-from transformers.image_utils import to_numpy_array, PILImageResampling, ChannelDimension
-from transformers.image_transforms import resize, to_channel_dimension_format
+from transformers import AutoProcessor, TextIteratorStreamer
+from transformers import Idefics2ForConditionalGeneration
 
+# Install flash attention
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+# Install private transformers fork which is the only place where idefics2 has been integrated at the time being
+subprocess.run(f"pip install git+https://VictorSanh:{os.environ['TRANSFORMERS_NEW_MODEL_ADDITION_TOKEN']}@github.com/huggingface/new-model-addition.git@fae11925a79d34fb0a9d6562941cacc177bd3f53", shell=True)
 
 DEVICE = torch.device("cuda")
 MODELS = {
-    "tr_290_bis_288_cinco_chatty - opt 150": AutoModelForCausalLM.from_pretrained(
-        "HuggingFaceM4/idefics2",
-        trust_remote_code=True,
+    "idefics2-8b (sft)": Idefics2ForConditionalGeneration.from_pretrained(
+        "/fsx/m4/victor/idefics2-8b",
         torch_dtype=torch.bfloat16,
-        token=os.environ["HF_AUTH_TOKEN"],
-        revision="9e47f905a9e262451c749286fcb97516cedff6d3",
-    ).to(DEVICE),
-    "tr_288_cinco_final_sft_sphinx - opt 11'000": AutoModelForCausalLM.from_pretrained(
-        "HuggingFaceM4/idefics2",
+        _attn_implementation="flash_attention_2",
         trust_remote_code=True,
-        torch_dtype=torch.bfloat16,
         token=os.environ["HF_AUTH_TOKEN"],
-        revision="316ea4acf714760882ad89e364ae1f8c447ae82e",
     ).to(DEVICE),
-    # "285 - continued pretraining on text sft - opt 2'000": AutoModelForCausalLM.from_pretrained(
-    #     "HuggingFaceM4/idefics2",
-    #     trust_remote_code=True,
-    #     torch_dtype=torch.bfloat16,
-    #     token=os.environ["HF_AUTH_TOKEN"],
-    #     revision="b0a2a564e5dc311591886bb375e8d5a1aeaade83",
-    # ).to(DEVICE),
 }
 PROCESSOR = AutoProcessor.from_pretrained(
-    "HuggingFaceM4/idefics2",
+    "HuggingFaceM4/idefics2-tfrm-compatible",
     token=os.environ["HF_AUTH_TOKEN"],
 )
-FAKE_TOK_AROUND_IMAGE = "<fake_token_around_image>"
-BOS_TOKEN = PROCESSOR.tokenizer.bos_token
 BAD_WORDS_IDS = PROCESSOR.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
 EOS_WORDS_IDS = PROCESSOR.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids + [PROCESSOR.tokenizer.eos_token_id]
-IMAGE_SEQ_LEN = 64#list(MODELS.values())[0].config.perceiver_config.resampler_n_latents
 
-SYSTEM_PROMPT = [
+SYSTEM_PROMPT = [ # Deactivating the system propmpt for now, but if I were to reactivate it, I would need to a/ transform turns into dict for applying the chat template, b/ manually overwrite the `default_template` to add the first line (that is not part of any turns), in particular for handling the bos_token.
 # """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
 
 # The conversation begins:""",
@@ -83,127 +67,14 @@ API_TOKEN = os.getenv("HF_AUTH_TOKEN")
 BOT_AVATAR = "IDEFICS_logo.png"
 
 
-# Model processing utils - these will be handled in the model processor directly ultimately
-def convert_to_rgb(image):
-    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
-    # for transparent images. The call to `alpha_composite` handles this case
-    if image.mode == "RGB":
-        return image
-
-    image_rgba = image.convert("RGBA")
-    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
-    alpha_composite = Image.alpha_composite(background, image_rgba)
-    alpha_composite = alpha_composite.convert("RGB")
-    return alpha_composite
-
-
-def custom_transform(x):
-    x = convert_to_rgb(x)
-    x = to_numpy_array(x)
-
-    height, width = x.shape[:2]
-    aspect_ratio = width / height
-    if width >= height and width > 980:
-        width = 980
-        height = int(width / aspect_ratio)
-    elif height > width and height > 980:
-        height = 980
-        width = int(height * aspect_ratio)
-    width = max(width, 378)
-    height = max(height, 378)
-
-    x = resize(x, (height, width), resample=PILImageResampling.BILINEAR)
-    x = PROCESSOR.image_processor.rescale(x, scale=1 / 255)
-    x = PROCESSOR.image_processor.normalize(
-        x,
-        mean=PROCESSOR.image_processor.image_mean,
-        std=PROCESSOR.image_processor.image_std
-    )
-    x = to_channel_dimension_format(x, ChannelDimension.FIRST)
-    x = torch.tensor(x)
-    return x
-
-
-def create_model_inputs(
-    input_texts: List[str],
-    image_lists: List[List[Image.Image]],
-):
-    """
-    All this logic will eventually be handled inside the model processor.
-    """
-    inputs = PROCESSOR.tokenizer(
-        input_texts,
-        return_tensors="pt",
-        add_special_tokens=False,
-        padding=True,
-    )
-
-    output_images = [
-        [PROCESSOR.image_processor(img, transform=custom_transform) for img in im_list]
-        for im_list in image_lists
-    ]
-    total_batch_size = len(output_images)
-    max_num_images = max([len(img_l) for img_l in output_images])
-    if max_num_images > 0:
-        max_height = max([i.size(2) for img_l in output_images for i in img_l])
-        max_width = max([i.size(3) for img_l in output_images for i in img_l])
-        padded_image_tensor = torch.zeros(total_batch_size, max_num_images, 3, max_height, max_width)
-        padded_pixel_attention_masks = torch.zeros(
-            total_batch_size, max_num_images, max_height, max_width, dtype=torch.bool
-        )
-        for batch_idx, img_l in enumerate(output_images):
-            for img_idx, img in enumerate(img_l):
-                im_height, im_width = img.size()[2:]
-                padded_image_tensor[batch_idx, img_idx, :, :im_height, :im_width] = img
-                padded_pixel_attention_masks[batch_idx, img_idx, :im_height, :im_width] = True
-
-        inputs["pixel_values"] = padded_image_tensor
-        inputs["pixel_attention_mask"] = padded_pixel_attention_masks
-
-    return inputs
-
-
 # Chatbot utils
-def is_image(string: str) -> bool:
-    """
-    There are two ways for images: local image path or url.
-    """
-    return is_url(string) or string.startswith(DEFAULT_TEMP_DIR)
-
-
-def is_url(string: str) -> bool:
-    """
-    Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
-    invalidated the url
-    """
-    if " " in string:
-        return False
-    result = urlparse(string)
-    return all([result.scheme, result.netloc])
-
-
-def prompt_list_to_model_input(prompt_list: List[str]) -> Tuple[str, List[Image.Image]]:
-    """
-    Create the final input string and image list to feed to the model.
-    """
-    images = []
-    for idx, part in enumerate(prompt_list):
-        if is_image(part):
-            images.append(Image.open(part))
-            prompt_list[idx] = f"{FAKE_TOK_AROUND_IMAGE}{'<image>' * IMAGE_SEQ_LEN}{FAKE_TOK_AROUND_IMAGE}"
-    input_text = "".join(prompt_list)
-    input_text = input_text.replace(FAKE_TOK_AROUND_IMAGE * 2, FAKE_TOK_AROUND_IMAGE)
-    input_text = BOS_TOKEN + input_text.strip()
-    return input_text, images
-
-
 def turn_is_pure_media(turn):
     return turn[1] is None
 
 
 def format_user_prompt_with_im_history_and_system_conditioning(
     user_prompt, chat_history
-) -> List[str]:
+) -> List[Dict[str, Union[List, str]]]:
     """
     Produces the resulting list that needs to go inside the processor.
     It handles the potential image(s), the history and the system conditionning.
@@ -212,30 +83,56 @@ def format_user_prompt_with_im_history_and_system_conditioning(
 
     # Format history
     for turn in chat_history:
+        if not resulting_list or (resulting_list and resulting_list[-1]["role"] != "user"):
+            resulting_list.append(
+                {
+                    "role": "user",
+                    "content": [],
+                }
+            )
+
         if turn_is_pure_media(turn):
             media = turn[0][0]
-            if resulting_list == [] or (resulting_list != [] and resulting_list[-1].endswith("<end_of_utterance>")):
-                resulting_list.append("\nUser:")
-            resulting_list.append(media)
+            resulting_list[-1]["content"].append(Image.open(media))
         else:
            user_utterance, assistant_utterance = turn
-            if resulting_list and is_image(resulting_list[-1]): # means that previous `turn` in `chat_history` was a pure media
-                resulting_list.append(f"{user_utterance.strip()}<end_of_utterance>\nAssistant: {assistant_utterance}<end_of_utterance>")
-            else:
-                resulting_list.append(f"\nUser: {user_utterance.strip()}<end_of_utterance>\nAssistant: {assistant_utterance}<end_of_utterance>")
+            resulting_list[-1]["content"].append(user_utterance.strip())
+            resulting_list.append(
+                {
+                    "role": "assistant",
+                    "content": [assistant_utterance]
+                }
+            )
 
     # Format current input
     if not user_prompt["files"]:
-        resulting_list.append(f"\nUser: ")
+        resulting_list.append(
+            {
+                "role": "user",
+                "content": [user_prompt['text']],
+            }
+        )
     else:
-        # Choosing to put the image first when the image is inputted through the UI, but this is an arbiratrary choice.
-        resulting_list.append("\nUser:")
-        resulting_list.extend([im["path"] for im in user_prompt["files"]])
-        resulting_list.append(f"{user_prompt['text']}<end_of_utterance>\nAssistant:")
+        # Choosing to put the image first (i.e. before the text), but this is an arbiratrary choice.
+        resulting_list.append(
+            {
+                "role": "user",
+                "content": [Image.open(im['path']) for im in user_prompt['files']] + [user_prompt['text']],
+            }
+        )
 
     return resulting_list
 
 
+def extract_images_from_msg_list(msg_list):
+    all_images = []
+    for msg in msg_list:
+        for c_ in msg["content"]:
+            if isinstance(c_, Image.Image):
+                all_images.append(c_)
+    return all_images
+
+
 @spaces.GPU(duration=180)
 def model_inference(
     user_prompt,
@@ -257,11 +154,6 @@ def model_inference(
         if not file["mime_type"].startswith("image/"):
            gr.Error("Idefics2 only supports images. Please input a valid image.")
 
-    formated_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
-        user_prompt=user_prompt,
-        chat_history=chat_history,
-    )
-
    streamer = TextIteratorStreamer(
        PROCESSOR.tokenizer,
        skip_prompt=True,
@@ -289,37 +181,38 @@
        generation_args["do_sample"] = True
        generation_args["top_p"] = top_p
 
-
    # Creating model inputs
-    input_text, images = prompt_list_to_model_input(formated_prompt_list)
-    inputs = create_model_inputs([input_text], [images])
+    formated_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
+        user_prompt=user_prompt,
+        chat_history=chat_history,
+    )
+    inputs = PROCESSOR.apply_chat_template(formated_prompt_list, add_generation_prompt=True, return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    generation_args.update(inputs)
 
    # # The regular non streaming generation mode
    # _ = generation_args.pop("streamer")
    # generated_ids = MODELS[model_selector].generate(**generation_args)
-    # generated_text = PROCESSOR.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    # generated_text = PROCESSOR.batch_decode(generated_ids[:, generation_args["input_ids"].size(-1): ], skip_special_tokens=True)[0]
    # return generated_text
 
+    # The streaming generation mode
    thread = Thread(
        target=MODELS[model_selector].generate,
        kwargs=generation_args,
    )
    thread.start()
 
-    print("start generating")
+    print("Start generating")
    acc_text = ""
-    try:
-        for text_token in streamer:
-            acc_text += text_token
-            time.sleep(0.03)
-            yield acc_text
-    except Exception as e:
-        print("error")
-        gr.Error(e)
-    print(f"Success! Generated the following sequence: `{acc_text}`")
-
+    for text_token in streamer:
+        time.sleep(0.04)
+        acc_text += text_token
+        if acc_text.endswith("<end_of_utterance>"):
+            acc_text = acc_text[:-18]
+        yield acc_text
+    print("Success - generated the following text:", acc_text)
+    print("-----")
 
 
 # Hyper-parameters for generation
@@ -373,7 +266,7 @@ top_p = gr.Slider(
 chatbot = gr.Chatbot(
    label="IDEFICS2",
    avatar_images=[None, BOT_AVATAR],
-    height=500,
+    height=750,
 )
 
 
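The streaming path in model_inference keeps the same thread-plus-streamer pattern; the diff only removes the try/except wrapper and trims a trailing <end_of_utterance> from the accumulated text. A condensed sketch of that loop, under the same assumptions as the example above (PROCESSOR, MODEL and inputs as defined there; generation parameters are illustrative, the Space wires them to UI sliders):

from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(PROCESSOR.tokenizer, skip_prompt=True)
generation_args = dict(inputs, streamer=streamer, max_new_tokens=512)

# generate() runs in a background thread while the caller drains the streamer.
thread = Thread(target=MODEL.generate, kwargs=generation_args)
thread.start()

acc_text = ""
for text_token in streamer:
    acc_text += text_token
    if acc_text.endswith("<end_of_utterance>"):
        acc_text = acc_text[: -len("<end_of_utterance>")]
    # the Gradio handler yields acc_text here to stream partial answers into the chatbot
print(acc_text)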
old_app_dialogue.py → idefics1_app_dialogue.py RENAMED
File without changes
the_updated_app_with_tfrm_integration.py → idefics2_old_app_dialogue.py RENAMED
@@ -6,43 +6,53 @@ import time
 import torch
 
 from threading import Thread
-from typing import List, Dict, Union
+from typing import List, Tuple
 from urllib.parse import urlparse
 from PIL import Image
 
 import gradio as gr
-from transformers import AutoProcessor, TextIteratorStreamer
-from transformers import Idefics2ForConditionalGeneration
+from gradio_client.client import DEFAULT_TEMP_DIR
+from transformers import AutoProcessor, AutoModelForCausalLM, TextIteratorStreamer
+from transformers.image_utils import to_numpy_array, PILImageResampling, ChannelDimension
+from transformers.image_transforms import resize, to_channel_dimension_format
 
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 DEVICE = torch.device("cuda")
 MODELS = {
-    "idefics2 lima 200": Idefics2ForConditionalGeneration.from_pretrained(
-        "HuggingFaceM4/idefics2-tfrm-compatible",
-        torch_dtype=torch.bfloat16,
-        _attn_implementation="flash_attention_2",
+    "tr_290_bis_288_cinco_chatty - opt 150": AutoModelForCausalLM.from_pretrained(
+        "HuggingFaceM4/idefics2",
         trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
         token=os.environ["HF_AUTH_TOKEN"],
-        revision="11794e2ae02dbf1c55d0ebd92c28e5b0b604cf5f",
+        revision="9e47f905a9e262451c749286fcb97516cedff6d3",
     ).to(DEVICE),
-    "idefics2 sft 12600": Idefics2ForConditionalGeneration.from_pretrained(
-        "HuggingFaceM4/idefics2-tfrm-compatible",
-        torch_dtype=torch.bfloat16,
-        _attn_implementation="flash_attention_2",
+    "tr_288_cinco_final_sft_sphinx - opt 11'000": AutoModelForCausalLM.from_pretrained(
+        "HuggingFaceM4/idefics2",
         trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
         token=os.environ["HF_AUTH_TOKEN"],
-        revision="86f134822798266d0d8db049cc6458c625e32344",
+        revision="316ea4acf714760882ad89e364ae1f8c447ae82e",
     ).to(DEVICE),
+    # "285 - continued pretraining on text sft - opt 2'000": AutoModelForCausalLM.from_pretrained(
+    #     "HuggingFaceM4/idefics2",
+    #     trust_remote_code=True,
+    #     torch_dtype=torch.bfloat16,
+    #     token=os.environ["HF_AUTH_TOKEN"],
+    #     revision="b0a2a564e5dc311591886bb375e8d5a1aeaade83",
+    # ).to(DEVICE),
 }
 PROCESSOR = AutoProcessor.from_pretrained(
-    "HuggingFaceM4/idefics2-tfrm-compatible",
+    "HuggingFaceM4/idefics2",
     token=os.environ["HF_AUTH_TOKEN"],
 )
+FAKE_TOK_AROUND_IMAGE = "<fake_token_around_image>"
+BOS_TOKEN = PROCESSOR.tokenizer.bos_token
 BAD_WORDS_IDS = PROCESSOR.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
 EOS_WORDS_IDS = PROCESSOR.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids + [PROCESSOR.tokenizer.eos_token_id]
+IMAGE_SEQ_LEN = 64#list(MODELS.values())[0].config.perceiver_config.resampler_n_latents
 
-SYSTEM_PROMPT = [ # Deactivating the system propmpt for now, but if I were to reactivate it, I would need to a/ transform turns into dict for applying the chat template, b/ manually overwrite the `default_template` to add the first line (that is not part of any turns), in particular for handling the bos_token.
+SYSTEM_PROMPT = [
 # """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
 
 # The conversation begins:""",
@@ -73,14 +83,127 @@ API_TOKEN = os.getenv("HF_AUTH_TOKEN")
 BOT_AVATAR = "IDEFICS_logo.png"
 
 
+# Model processing utils - these will be handled in the model processor directly ultimately
+def convert_to_rgb(image):
+    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
+    # for transparent images. The call to `alpha_composite` handles this case
+    if image.mode == "RGB":
+        return image
+
+    image_rgba = image.convert("RGBA")
+    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
+    alpha_composite = Image.alpha_composite(background, image_rgba)
+    alpha_composite = alpha_composite.convert("RGB")
+    return alpha_composite
+
+
+def custom_transform(x):
+    x = convert_to_rgb(x)
+    x = to_numpy_array(x)
+
+    height, width = x.shape[:2]
+    aspect_ratio = width / height
+    if width >= height and width > 980:
+        width = 980
+        height = int(width / aspect_ratio)
+    elif height > width and height > 980:
+        height = 980
+        width = int(height * aspect_ratio)
+    width = max(width, 378)
+    height = max(height, 378)
+
+    x = resize(x, (height, width), resample=PILImageResampling.BILINEAR)
+    x = PROCESSOR.image_processor.rescale(x, scale=1 / 255)
+    x = PROCESSOR.image_processor.normalize(
+        x,
+        mean=PROCESSOR.image_processor.image_mean,
+        std=PROCESSOR.image_processor.image_std
+    )
+    x = to_channel_dimension_format(x, ChannelDimension.FIRST)
+    x = torch.tensor(x)
+    return x
+
+
+def create_model_inputs(
+    input_texts: List[str],
+    image_lists: List[List[Image.Image]],
+):
+    """
+    All this logic will eventually be handled inside the model processor.
+    """
+    inputs = PROCESSOR.tokenizer(
+        input_texts,
+        return_tensors="pt",
+        add_special_tokens=False,
+        padding=True,
+    )
+
+    output_images = [
+        [PROCESSOR.image_processor(img, transform=custom_transform) for img in im_list]
+        for im_list in image_lists
+    ]
+    total_batch_size = len(output_images)
+    max_num_images = max([len(img_l) for img_l in output_images])
+    if max_num_images > 0:
+        max_height = max([i.size(2) for img_l in output_images for i in img_l])
+        max_width = max([i.size(3) for img_l in output_images for i in img_l])
+        padded_image_tensor = torch.zeros(total_batch_size, max_num_images, 3, max_height, max_width)
+        padded_pixel_attention_masks = torch.zeros(
+            total_batch_size, max_num_images, max_height, max_width, dtype=torch.bool
+        )
+        for batch_idx, img_l in enumerate(output_images):
+            for img_idx, img in enumerate(img_l):
+                im_height, im_width = img.size()[2:]
+                padded_image_tensor[batch_idx, img_idx, :, :im_height, :im_width] = img
+                padded_pixel_attention_masks[batch_idx, img_idx, :im_height, :im_width] = True
+
+        inputs["pixel_values"] = padded_image_tensor
+        inputs["pixel_attention_mask"] = padded_pixel_attention_masks
+
+    return inputs
+
+
 # Chatbot utils
+def is_image(string: str) -> bool:
+    """
+    There are two ways for images: local image path or url.
+    """
+    return is_url(string) or string.startswith(DEFAULT_TEMP_DIR)
+
+
+def is_url(string: str) -> bool:
+    """
+    Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
+    invalidated the url
+    """
+    if " " in string:
+        return False
+    result = urlparse(string)
+    return all([result.scheme, result.netloc])
+
+
+def prompt_list_to_model_input(prompt_list: List[str]) -> Tuple[str, List[Image.Image]]:
+    """
+    Create the final input string and image list to feed to the model.
+    """
+    images = []
+    for idx, part in enumerate(prompt_list):
+        if is_image(part):
+            images.append(Image.open(part))
+            prompt_list[idx] = f"{FAKE_TOK_AROUND_IMAGE}{'<image>' * IMAGE_SEQ_LEN}{FAKE_TOK_AROUND_IMAGE}"
+    input_text = "".join(prompt_list)
+    input_text = input_text.replace(FAKE_TOK_AROUND_IMAGE * 2, FAKE_TOK_AROUND_IMAGE)
+    input_text = BOS_TOKEN + input_text.strip()
+    return input_text, images
+
+
 def turn_is_pure_media(turn):
     return turn[1] is None
 
 
 def format_user_prompt_with_im_history_and_system_conditioning(
     user_prompt, chat_history
-) -> List[Dict[str, Union[List, str]]]:
+) -> List[str]:
     """
     Produces the resulting list that needs to go inside the processor.
     It handles the potential image(s), the history and the system conditionning.
@@ -89,56 +212,30 @@ def format_user_prompt_with_im_history_and_system_conditioning(
 
     # Format history
     for turn in chat_history:
-        if not resulting_list or (resulting_list and resulting_list[-1]["role"] != "user"):
-            resulting_list.append(
-                {
-                    "role": "user",
-                    "content": [],
-                }
-            )
-
         if turn_is_pure_media(turn):
             media = turn[0][0]
-            resulting_list[-1]["content"].append(Image.open(media))
+            if resulting_list == [] or (resulting_list != [] and resulting_list[-1].endswith("<end_of_utterance>")):
+                resulting_list.append("\nUser:")
+            resulting_list.append(media)
         else:
            user_utterance, assistant_utterance = turn
-            resulting_list[-1]["content"].append(user_utterance.strip())
-            resulting_list.append(
-                {
-                    "role": "assistant",
-                    "content": [assistant_utterance]
-                }
-            )
+            if resulting_list and is_image(resulting_list[-1]): # means that previous `turn` in `chat_history` was a pure media
+                resulting_list.append(f"{user_utterance.strip()}<end_of_utterance>\nAssistant: {assistant_utterance}<end_of_utterance>")
+            else:
+                resulting_list.append(f"\nUser: {user_utterance.strip()}<end_of_utterance>\nAssistant: {assistant_utterance}<end_of_utterance>")
 
     # Format current input
     if not user_prompt["files"]:
-        resulting_list.append(
-            {
-                "role": "user",
-                "content": [user_prompt['text']],
-            }
-        )
+        resulting_list.append(f"\nUser: ")
     else:
-        # Choosing to put the image first (i.e. before the text), but this is an arbiratrary choice.
-        resulting_list.append(
-            {
-                "role": "user",
-                "content": [Image.open(im['path']) for im in user_prompt['files']] + [user_prompt['text']],
-            }
-        )
+        # Choosing to put the image first when the image is inputted through the UI, but this is an arbiratrary choice.
+        resulting_list.append("\nUser:")
+        resulting_list.extend([im["path"] for im in user_prompt["files"]])
+        resulting_list.append(f"{user_prompt['text']}<end_of_utterance>\nAssistant:")
 
     return resulting_list
 
 
-def extract_images_from_msg_list(msg_list):
-    all_images = []
-    for msg in msg_list:
-        for c_ in msg["content"]:
-            if isinstance(c_, Image.Image):
-                all_images.append(c_)
-    return all_images
-
-
 @spaces.GPU(duration=180)
 def model_inference(
     user_prompt,
@@ -160,6 +257,11 @@ def model_inference(
         if not file["mime_type"].startswith("image/"):
            gr.Error("Idefics2 only supports images. Please input a valid image.")
 
+    formated_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
+        user_prompt=user_prompt,
+        chat_history=chat_history,
+    )
+
    streamer = TextIteratorStreamer(
        PROCESSOR.tokenizer,
        skip_prompt=True,
@@ -187,22 +289,19 @@ def model_inference(
        generation_args["do_sample"] = True
        generation_args["top_p"] = top_p
 
+
    # Creating model inputs
-    formated_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
-        user_prompt=user_prompt,
-        chat_history=chat_history,
-    )
-    inputs = PROCESSOR.apply_chat_template(formated_prompt_list, add_generation_prompt=True, return_tensors="pt")
+    input_text, images = prompt_list_to_model_input(formated_prompt_list)
+    inputs = create_model_inputs([input_text], [images])
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    generation_args.update(inputs)
 
    # # The regular non streaming generation mode
    # _ = generation_args.pop("streamer")
    # generated_ids = MODELS[model_selector].generate(**generation_args)
-    # generated_text = PROCESSOR.batch_decode(generated_ids[:, generation_args["input_ids"].size(-1): ], skip_special_tokens=True)[0]
+    # generated_text = PROCESSOR.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # return generated_text
 
-    # The streaming generation mode
    thread = Thread(
        target=MODELS[model_selector].generate,
        kwargs=generation_args,
@@ -211,13 +310,16 @@ def model_inference(
 
    print("start generating")
    acc_text = ""
-    for text_token in streamer:
-        time.sleep(0.04)
-        acc_text += text_token
-        if acc_text.endswith("<end_of_utterance>"):
-            acc_text = acc_text[:-18]
-        yield acc_text
-    print("success - generated the following text:", acc_text)
+    try:
+        for text_token in streamer:
+            acc_text += text_token
+            time.sleep(0.03)
+            yield acc_text
+    except Exception as e:
+        print("error")
+        gr.Error(e)
+    print(f"Success! Generated the following sequence: `{acc_text}`")
+
 
 
 # Hyper-parameters for generation
@@ -271,7 +373,7 @@ top_p = gr.Slider(
 chatbot = gr.Chatbot(
    label="IDEFICS2",
    avatar_images=[None, BOT_AVATAR],
-    height=750,
+    height=500,
 )
 
 