VictorSanh committed on
Commit
ac8d53f
1 Parent(s): f20bb4d

Update visualization

Files changed (5)
  1. app_bis.py +0 -857
  2. app_dialogue.py +279 -119
  3. elon_musk.md +0 -23
  4. hello.py +0 -73
  5. requirements.txt +1 -1
app_bis.py DELETED
@@ -1,857 +0,0 @@
1
- import logging
2
- import os
3
- import re
4
-
5
- import time
6
- from io import BytesIO
7
-
8
- import gradio as gr
9
- import requests
10
- import torch
11
- import transformers
12
- from accelerate.utils import get_max_memory
13
-
14
- from joblib import Parallel, delayed
15
- from PIL import Image
16
- from transformers import AutoTokenizer
17
-
18
- from m4.models.vbloom import configuration_vbloom, modeling_vbloom
19
- from m4.models.vgpt2 import configuration_vgpt2, modeling_vgpt2
20
- from m4.models.vgpt_neo import configuration_vgpt_neo, modeling_vgpt_neo
21
- from m4.models.vllama import configuration_vllama, modeling_vllama
22
- from m4.models.vopt import configuration_vopt, modeling_vopt
23
- from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
24
- from m4.training.utils import build_image_transform
25
-
26
-
27
- logging.basicConfig(level=logging.INFO)
28
- logger = logging.getLogger()
29
-
30
- CURRENT_MODEL = "tr_209_ift_mixture_opt_step-2000"
31
-
32
- MAX_TRIES = 3
33
- TOKENIZER_FAST = True
34
- MAX_SEQ_LEN = 1024
35
- model, tokenizer = None, None
36
-
37
-
38
- MODEL_TO_DISPLAY_NAME = {
39
- "tr_199_w_xattn_opt_step-65000": "VLlama - tr_199_w_xattn_opt_step-65000",
40
- "tr_201_sft_on_lrv_opt_step-15000": "VLlama - tr_201_sft_on_lrv_opt_step-15000",
41
- "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlama - tr_202bis_ift_llava_all_unfrozen_opt_step-14128",
42
- "tr_203_ift_m3it_opt_step-50000": "VLlama - tr_203_ift_m3it_opt_step-50000",
43
- "tr_205_sft_ultrachat_opt_step-20000": "VLlama - tr_205_sft_ultrachat_opt_step-20000",
44
- "tr_207_ift_svit_opt_step-14627": "VLlama - tr_207_ift_svit_opt_step-14627",
45
- "tr_209_ift_mixture_opt_step-2000": "VLlama - tr_209_ift_mixture_opt_step-2000",
46
- }
47
- MODEL_TO_MODEL_CLASS = {
48
- "tr_199_w_xattn_opt_step-65000": "VLlamaForCausalLM",
49
- "tr_201_sft_on_lrv_opt_step-15000": "VLlamaForCausalLM",
50
- "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlamaForCausalLM",
51
- "tr_203_ift_m3it_opt_step-50000": "VLlamaForCausalLM",
52
- "tr_205_sft_ultrachat_opt_step-20000": "VLlamaForCausalLM",
53
- "tr_207_ift_svit_opt_step-14627": "VLlamaForCausalLM",
54
- "tr_209_ift_mixture_opt_step-2000": "VLlamaForCausalLM",
55
- }
56
-
57
- MODEL_TO_CONFIG_CLASS = {
58
- "tr_199_w_xattn_opt_step-65000": "VLlamaConfig",
59
- "tr_201_sft_on_lrv_opt_step-15000": "VLlamaConfig",
60
- "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlamaConfig",
61
- "tr_203_ift_m3it_opt_step-50000": "VLlamaConfig",
62
- "tr_205_sft_ultrachat_opt_step-20000": "VLlamaConfig",
63
- "tr_207_ift_svit_opt_step-14627": "VLlamaConfig",
64
- "tr_209_ift_mixture_opt_step-2000": "VLlamaConfig",
65
- }
66
-
67
-
68
- def load_tokenizer_model(model_name, model_class):
69
- tokenizer = AutoTokenizer.from_pretrained(
70
- model_name,
71
- use_fast=TOKENIZER_FAST,
72
- use_auth_token=os.getenv("HF_AUTH_TOKEN", True), # `use_fast=False` for 1B3 OPT, True for all the other models
73
- )
74
- tokenizer.padding_side = "left"
75
- config_class = MODEL_TO_CONFIG_CLASS[model_name.split("/")[-1]]
76
-
77
- # assert tokenizer.is_fast
78
-
79
- supported_custom_modules = {
80
- "vgpt2": modeling_vgpt2,
81
- "vbloom": modeling_vbloom,
82
- "vgptneo": modeling_vgpt_neo,
83
- "vopt": modeling_vopt,
84
- "vllama": modeling_vllama,
85
- }
86
- supported_custom_configs = {
87
- "vgpt2": configuration_vgpt2,
88
- "vbloom": configuration_vbloom,
89
- "vgptneo": configuration_vgpt_neo,
90
- "vopt": configuration_vopt,
91
- "vllama": configuration_vllama,
92
- }
93
- parent_config_class = (
94
- [v for k, v in supported_custom_configs.items() if k in model_class.lower()] + [transformers]
95
- )[0]
96
- parent_model_class = (
97
- [v for k, v in supported_custom_modules.items() if k in model_class.lower()] + [transformers]
98
- )[0]
99
- config_class = getattr(parent_config_class, config_class)
100
- model_class = getattr(parent_model_class, model_class)
101
- config = config_class.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
102
- max_memory_map = get_max_memory()
103
- for key in max_memory_map.keys():
104
- if key != "cpu":
105
- # Get this in GB
106
- max_memory_map[key] = max_memory_map[key] // (1024 * 1024 * 1024)
107
- # Decrease 2 for Pytorch overhead and 2 for the forward to be safe
108
- max_memory_map[key] = f"{max_memory_map[key] - 4} GiB"
109
- model = model_class.from_pretrained(
110
- model_name,
111
- use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
112
- device_map="auto",
113
- offload_folder="./offload",
114
- torch_dtype=config.torch_dtype,
115
- max_memory=max_memory_map,
116
- )
117
- model.eval()
118
- print("Current device map:", model.hf_device_map)
119
- print("Model default generation config:", model.generation_config)
120
- # TODO: the device_map looks very inefficient right now; that could be improved
121
- # it typically looks like that
122
- # {
123
- # 'model.embed_tokens': 0,
124
- # 'model.vision_model': 0,
125
- # 'model.layers.0': 0,
126
- # 'model.layers.1': 0,
127
- # 'model.layers.2': 0,
128
- # 'model.layers.3': 0,
129
- # 'model.layers.4': 0,
130
- # 'model.layers.5': 0,
131
- # 'model.layers.6': 1,
132
- # 'model.layers.7': 1,
133
- # 'model.layers.8': 1,
134
- # 'model.layers.9': 1,
135
- # 'model.layers.10': 1,
136
- # 'model.layers.11': 1,
137
- # 'model.layers.12': 1,
138
- # 'model.layers.13': 1,
139
- # 'model.layers.14': 1,
140
- # 'model.layers.15': 1,
141
- # 'model.layers.16': 1,
142
- # 'model.layers.17': 2,
143
- # 'model.layers.18': 2,
144
- # 'model.layers.19': 2,
145
- # 'model.layers.20': 2,
146
- # 'model.layers.21': 2,
147
- # 'model.layers.22': 2,
148
- # 'model.layers.23': 2,
149
- # 'model.layers.24': 2,
150
- # 'model.layers.25': 2,
151
- # 'model.layers.26': 2,
152
- # 'model.layers.27': 2,
153
- # 'model.layers.28': 3,
154
- # 'model.layers.29': 3,
155
- # 'model.layers.30': 3,
156
- # 'model.layers.31': 3,
157
- # 'model.gated_cross_attn_layers.0': 3,
158
- # 'model.gated_cross_attn_layers.1': 3,
159
- # 'model.gated_cross_attn_layers.2': 3,
160
- # 'model.gated_cross_attn_layers.3': 3,
161
- # 'model.gated_cross_attn_layers.4': 3,
162
- # 'model.gated_cross_attn_layers.5': 3,
163
- # 'model.gated_cross_attn_layers.6': 3,
164
- # 'model.gated_cross_attn_layers.7': 3,
165
- # 'model.gated_cross_attn_layers.8': 4,
166
- # 'model.gated_cross_attn_layers.9': 4,
167
- # 'model.gated_cross_attn_layers.10': 4,
168
- # 'model.gated_cross_attn_layers.11': 4,
169
- # 'model.gated_cross_attn_layers.12': 4,
170
- # 'model.gated_cross_attn_layers.13': 4,
171
- # 'model.gated_cross_attn_layers.14': 4,
172
- # 'model.gated_cross_attn_layers.15': 4,
173
- # 'model.norm': 4,
174
- # 'lm_head': 4
175
- # } which means there is a lot of back-and-forth between the gated cross-attention layers and the LM layers...
176
- return tokenizer, model
177
-
178
-
179
- MODEL_TO_SPACE_MAPPING = {}
180
- IS_MAIN_SPACE = CURRENT_MODEL not in MODEL_TO_MODEL_CLASS
181
- if IS_MAIN_SPACE:
182
- for model in MODEL_TO_MODEL_CLASS:
183
- MODEL_TO_SPACE_MAPPING[model] = gr.Blocks.load(
184
- name=f"spaces/HuggingFaceM4/{model}", api_key=os.getenv("HF_AUTH_TOKEN", True)
185
- )
186
- else:
187
- model_path = f"HuggingFaceM4/{CURRENT_MODEL}"
188
- tokenizer, model = load_tokenizer_model(model_path, MODEL_TO_MODEL_CLASS[CURRENT_MODEL])
189
-
190
-
191
- def fetch_images(url_images):
192
- images = []
193
- for url in url_images:
194
- if isinstance(url, str):
195
- images.append(Image.open(BytesIO(requests.get(url, stream=True).content)))
196
- else:
197
- images.append(url)
198
- return images
199
-
200
-
201
- def model_generation(
202
- prompt,
203
- images,
204
- tokenizer,
205
- model,
206
- temperature,
207
- no_repeat_ngram_size,
208
- max_new_tokens,
209
- min_length,
210
- ban_tokens,
211
- forced_eos_token_id,
212
- eos_tokens,
213
- force_words,
214
- length_penalty,
215
- repetition_penalty,
216
- hide_special_tokens,
217
- stop_generation,
218
- decoding_strategy,
219
- num_beams,
220
- top_k,
221
- top_p,
222
- penalty_alpha,
223
- ):
224
- # Preparing inputs
225
- tokens = tokenizer(
226
- [prompt],
227
- truncation=True,
228
- max_length=MAX_SEQ_LEN,
229
- padding=True,
230
- add_special_tokens=False,
231
- )
232
-
233
- input_ids = torch.tensor([[tokenizer.bos_token_id] + tokens.input_ids[0]])
234
- attention_mask = torch.tensor([[1] + tokens.attention_mask[0]])
235
-
236
- image_attention_mask = [
237
- incremental_to_binary_attention_mask(
238
- image_attention_mask_for_packed_input_ids(input_ids[0].unsqueeze(0), tokenizer)[0], num_classes=len(images)
239
- )
240
- ]
241
-
242
- image_transform = build_image_transform(eval=True)
243
- pixel_values = [torch.stack([image_transform(img) for img in images])]
244
-
245
- input_ids = input_ids.to(0)
246
- attention_mask = attention_mask.to(0)
247
- pixel_values = torch.stack(pixel_values).to(0)
248
- image_attention_mask = torch.cat(image_attention_mask, 0).to(0)
249
-
250
- # Excluding some words from the generation
251
- bad_words_ids = None
252
- ban_tokens = ban_tokens.replace("\\n", "\n")
253
- bad_words = ban_tokens.split(";")
254
- if len(bad_words) > 0:
255
- bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids
256
-
257
- # Forcing some words in the generation
258
- force_words_ids = None
259
- if force_words != "":
260
- force_words = force_words.replace("\\n", "\n")
261
- force_words = force_words.split(";")
262
- if len(force_words) > 0:
263
- force_words_ids = tokenizer(force_words, add_special_tokens=False).input_ids
264
-
265
- # eos_token_ids = None
266
- # if eos_tokens != "":
267
- # eos_tokens = eos_tokens.replace("\\n", "\n")
268
- # eos_tokens = eos_tokens.split(";")
269
- # if len(eos_tokens) > 0:
270
- # eos_token_ids = []
271
- # for eos_token in eos_tokens:
272
- # tokenized_eos_token = tokenizer(eos_token, add_special_tokens=False).input_ids
273
- # if len(tokenized_eos_token) > 1:
274
- # raise ValueError(
275
- # f"eos_tokens should be one token, here {eos_token} is {len(tokenized_eos_token)} tokens:"
276
- # f" {tokenized_eos_token}"
277
- # )
278
- # eos_token_ids += tokenized_eos_token
279
-
280
- # if forced_eos_token_id and eos_token_ids is None:
281
- # raise ValueError("You can't use forced_eos_token_id without eos_tokens")
282
- # elif forced_eos_token_id:
283
- # forced_eos_token_id = eos_token_ids
284
- # else:
285
- # forced_eos_token_id = None
286
-
287
- # Inputs
288
- input_args = {
289
- "input_ids": input_ids,
290
- "attention_mask": attention_mask,
291
- "pixel_values": pixel_values,
292
- "image_attention_mask": image_attention_mask,
293
- }
294
- # Common parameters to all decoding strategies
295
- # This documentation is useful to read: https://huggingface.co/docs/transformers/main/en/generation_strategies
296
- generation_args = {
297
- "temperature": temperature,
298
- "no_repeat_ngram_size": no_repeat_ngram_size,
299
- "max_new_tokens": max_new_tokens,
300
- "min_length": min_length,
301
- "bad_words_ids": bad_words_ids,
302
- # "forced_eos_token_id": forced_eos_token_id,
303
- "force_words_ids": force_words_ids,
304
- "length_penalty": length_penalty,
305
- "repetition_penalty": repetition_penalty,
306
- "eos_token_id": tokenizer.eos_token_id,
307
- }
308
-
309
- assert decoding_strategy in [
310
- "greedy",
311
- "beam_search",
312
- "beam_sampling",
313
- "sampling_top_k",
314
- "sampling_top_p",
315
- "contrastive_sampling",
316
- ]
317
- if decoding_strategy == "greedy":
318
- pass
319
- elif decoding_strategy == "beam_search":
320
- generation_args["num_beams"] = num_beams
321
- assert generation_args["num_beams"] > 1
322
- elif decoding_strategy == "beam_sampling":
323
- generation_args["num_beams"] = num_beams
324
- generation_args["do_sample"] = True
325
- assert generation_args["num_beams"] > 1
326
- elif decoding_strategy == "sampling_top_k":
327
- generation_args["do_sample"] = True
328
- generation_args["top_k"] = top_k
329
- elif decoding_strategy == "sampling_top_p":
330
- generation_args["do_sample"] = True
331
- generation_args["top_p"] = top_p
332
- elif decoding_strategy == "contrastive_sampling":
333
- generation_args["do_sample"] = True
334
- generation_args["penalty_alpha"] = penalty_alpha
335
- generation_args["top_k"] = top_k
336
-
337
- generated_tokens = model.generate(
338
- **input_args,
339
- **generation_args,
340
- )
341
- tokens = tokenizer.convert_ids_to_tokens(generated_tokens[0])
342
- decoded_skip_special_tokens = repr(
343
- tokenizer.batch_decode(generated_tokens, skip_special_tokens=hide_special_tokens)[0]
344
- )
345
- decoded = repr(tokenizer.batch_decode(generated_tokens)[0])
346
- logger.info(
347
- "Result: \n"
348
- f"Prompt: `{prompt}`\n"
349
- f"Tokens ids from prompt + generation: `{generated_tokens[0].tolist()}`\n"
350
- f"Tokens (converted) from prompt + generation: `{tokens}`\n"
351
- f"String decoded with skipped special tokens: `{decoded_skip_special_tokens}`\n"
352
- f"String decoded: `{decoded}`\n"
353
- f"Generation mode: `{decoding_strategy}`\n"
354
- f"Generation parameters: `{generation_args}`\n"
355
- )
356
-
357
- original_prompt = generated_tokens[:, : input_ids.shape[-1]]
358
- actual_generated_tokens = generated_tokens[:, input_ids.shape[-1] :]
359
-
360
- if stop_generation:
361
- # Additional stopping criteria: generating <image> token, <end_of_text> token or <begin_of_text> token
362
- assert tokenizer.additional_special_tokens[-1] == "<image>"
363
- image_token_id = tokenizer.additional_special_tokens_ids[-1]
364
- end_of_text_token_id = tokenizer.eos_token_id
365
- begin_of_text_token_id = tokenizer.bos_token_id
366
-
367
- image_token_ids = (actual_generated_tokens == image_token_id).nonzero(as_tuple=True)[1]
368
- end_of_text_token_ids = (actual_generated_tokens == end_of_text_token_id).nonzero(as_tuple=True)[1]
369
- begin_of_text_token_ids = (actual_generated_tokens == begin_of_text_token_id).nonzero(as_tuple=True)[1]
370
-
371
- first_end_token = min(
372
- image_token_ids[0] if len(image_token_ids) else len(actual_generated_tokens[0]),
373
- end_of_text_token_ids[0] if len(end_of_text_token_ids) else len(actual_generated_tokens[0]),
374
- begin_of_text_token_ids[0] if len(begin_of_text_token_ids) else len(actual_generated_tokens[0]),
375
- )
376
- else:
377
- first_end_token = len(actual_generated_tokens[0])
378
-
379
- actual_generated_tokens = actual_generated_tokens[:, :first_end_token]
380
- displayed_tokens = torch.cat([original_prompt, actual_generated_tokens], dim=-1)
381
- generated_text = tokenizer.batch_decode(displayed_tokens, skip_special_tokens=hide_special_tokens)[0]
382
- return generated_text
383
-
384
-
385
- def model_inference(
386
- files,
387
- prompt,
388
- temperature,
389
- no_repeat_ngram_size,
390
- max_new_tokens,
391
- min_length,
392
- ban_tokens,
393
- forced_eos_token_id,
394
- eos_tokens,
395
- force_words,
396
- length_penalty,
397
- repetition_penalty,
398
- hide_special_tokens,
399
- stop_generation,
400
- decoding_strategy,
401
- num_beams,
402
- top_k,
403
- top_p,
404
- penalty_alpha,
405
- ):
406
- if isinstance(files, str) and len(files) == 0:
407
- files = None
408
-
409
- prompt = prompt.strip()
410
- prompt = prompt.replace("\\n", "\n")
411
- file_idx = 0
412
- url_images = re.findall(r"<image(.*?)>", prompt)
413
- for idx, url_image in enumerate(url_images):
414
- if len(url_image) == 0:
415
- url_images[idx] = Image.open(files[file_idx].name if hasattr(files[file_idx], "name") else files[file_idx])
416
- file_idx += 1
417
- else:
418
- prompt = prompt.replace(url_image, "")
419
- url_images[idx] = url_images[idx][1:]
420
- images = fetch_images(url_images)
421
-
422
- global model, tokenizer
423
-
424
- generated_text = model_generation(
425
- prompt=prompt,
426
- images=images,
427
- tokenizer=tokenizer,
428
- model=model,
429
- temperature=temperature,
430
- no_repeat_ngram_size=no_repeat_ngram_size,
431
- max_new_tokens=max_new_tokens,
432
- min_length=min_length,
433
- ban_tokens=ban_tokens,
434
- forced_eos_token_id=forced_eos_token_id,
435
- eos_tokens=eos_tokens,
436
- force_words=force_words,
437
- length_penalty=length_penalty,
438
- repetition_penalty=repetition_penalty,
439
- hide_special_tokens=hide_special_tokens,
440
- stop_generation=stop_generation,
441
- decoding_strategy=decoding_strategy,
442
- num_beams=num_beams,
443
- top_k=top_k,
444
- top_p=top_p,
445
- penalty_alpha=penalty_alpha,
446
- )
447
- return generated_text.strip()
448
-
449
-
450
- def try_model_inference(
451
- model,
452
- files,
453
- prompt,
454
- temperature,
455
- no_repeat_ngram_size,
456
- max_new_tokens,
457
- min_length,
458
- ban_tokens,
459
- forced_eos_token_id,
460
- eos_tokens,
461
- force_words,
462
- length_penalty,
463
- repetition_penalty,
464
- hide_special_tokens,
465
- stop_generation,
466
- decoding_strategy,
467
- num_beams,
468
- top_k,
469
- top_p,
470
- penalty_alpha,
471
- ):
472
- count = 0
473
- while count < MAX_TRIES:
474
- try:
475
- return MODEL_TO_SPACE_MAPPING[model](
476
- files,
477
- prompt,
478
- temperature,
479
- no_repeat_ngram_size,
480
- max_new_tokens,
481
- min_length,
482
- ban_tokens,
483
- forced_eos_token_id,
484
- eos_tokens,
485
- force_words,
486
- length_penalty,
487
- repetition_penalty,
488
- hide_special_tokens,
489
- stop_generation,
490
- decoding_strategy,
491
- num_beams,
492
- top_k,
493
- top_p,
494
- penalty_alpha,
495
- api_name="model_inference",
496
- )
497
- except KeyError:
498
- # Gradio returns {'error': None} sometimes.
499
- time.sleep(3)
500
- count += 1
501
- pass
502
-
503
-
504
- def all_model_inference(
505
- prompt,
506
- temperature,
507
- no_repeat_ngram_size,
508
- max_new_tokens,
509
- min_length,
510
- ban_tokens,
511
- forced_eos_token_id,
512
- eos_tokens,
513
- force_words,
514
- length_penalty,
515
- repetition_penalty,
516
- hide_special_tokens,
517
- stop_generation,
518
- decoding_strategy,
519
- num_beams,
520
- top_k,
521
- top_p,
522
- penalty_alpha,
523
- ):
524
- outputs = []
525
- print(
526
- prompt,
527
- temperature,
528
- no_repeat_ngram_size,
529
- max_new_tokens,
530
- min_length,
531
- ban_tokens,
532
- forced_eos_token_id,
533
- eos_tokens,
534
- force_words,
535
- length_penalty,
536
- repetition_penalty,
537
- hide_special_tokens,
538
- stop_generation,
539
- decoding_strategy,
540
- num_beams,
541
- top_k,
542
- top_p,
543
- penalty_alpha,
544
- )
545
- outputs = Parallel(n_jobs=len(MODEL_TO_SPACE_MAPPING), backend="threading")(
546
- delayed(try_model_inference)(
547
- model,
548
- os.path.join(os.path.dirname(__file__), "images", "bear.jpg"),
549
- prompt,
550
- temperature,
551
- no_repeat_ngram_size,
552
- max_new_tokens,
553
- min_length,
554
- ban_tokens,
555
- forced_eos_token_id,
556
- eos_tokens,
557
- force_words,
558
- length_penalty,
559
- repetition_penalty,
560
- hide_special_tokens,
561
- stop_generation,
562
- decoding_strategy,
563
- num_beams,
564
- top_k,
565
- top_p,
566
- penalty_alpha,
567
- )
568
- for model in MODEL_TO_SPACE_MAPPING
569
- )
570
- if len(outputs) == 1:
571
- outputs = outputs[0]
572
- return outputs
573
-
574
-
575
- examples = [
576
- [
577
- None,
578
- """This is a conversation between a human, User, and an intelligent visual AI, Assistant. User sends images, and Assistant answers the questions from the user. The assistant should be friendly, informative and should not change the topic if it's not asked to.
579
-
580
- Here's an example of a conversation:
581
- User:<fake_token_around_image><image:https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg><fake_token_around_image>Describe this image.
582
- Assistant: Two kittens are cuddling in the center of the photograph. They are surrounded by tall, bright green grass, and the background is blurred.
583
- User:<fake_token_around_image><image:https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg><fake_token_around_image>How about this image? Can you describe it too?
584
- Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you, tilting its head to the side. The dog has a white body and brown patches on its face and ears. Its eyes are brown. Its nose is black, and it has long, floppy ears, short legs, white paws, long fur, big eyes, and black eyebrows.
585
- User: What kind of breed is it?
586
- Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.
587
- ---
588
- User:<fake_token_around_image><image:https://m.media-amazon.com/images/M/MV5BMjE4MTcwMTM1Nl5BMl5BanBnXkFtZTcwMTIwMzMzMw@@._V1_.jpg><fake_token_around_image>Describe all of the parts of this image.
589
- Assistant:""",
590
- 1.0,
591
- 0,
592
- 256,
593
- 10,
594
- "<image>;<fake_token_around_image>",
595
- False,
596
- "</s>",
597
- "",
598
- 1.0,
599
- 1.0,
600
- False,
601
- True,
602
- "greedy",
603
- 1,
604
- 50,
605
- 0.5,
606
- 0.95,
607
- ],
608
- # [
609
- # None,
610
- # """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
611
- # User: <fake_token_around_image><image:https://m.media-amazon.com/images/M/MV5BMjE4MTcwMTM1Nl5BMl5BanBnXkFtZTcwMTIwMzMzMw@@._V1_.jpg><fake_token_around_image>
612
- # Describe this image.
613
- # Bot:""",
614
- # 1,
615
- # 2,
616
- # 64,
617
- # 10,
618
- # "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
619
- # False,
620
- # False,
621
- # True,
622
- # ],
623
- # [
624
- # None,
625
- # """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
626
- # User: <fake_token_around_image><image:https://i.redd.it/hsktcp4nv1g01.jpg><fake_token_around_image>
627
- # Why do people find this image funny?
628
- # Bot:""",
629
- # 1,
630
- # 2,
631
- # 64,
632
- # 10,
633
- # "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
634
- # False,
635
- # False,
636
- # True,
637
- # ],
638
- # [
639
- # None,
640
- # """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
641
- # User: <fake_token_around_image><image:https://pbs.twimg.com/media/FooD7oyakAIU5_Q?format=jpg&name=large><fake_token_around_image>
642
- # Describe what's in this image.
643
- # Bot:""",
644
- # 1,
645
- # 2,
646
- # 64,
647
- # 10,
648
- # "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
649
- # False,
650
- # False,
651
- # True,
652
- # ],
653
- # [
654
- # None,
655
- # """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
656
- # User: <fake_token_around_image><image:https://www.tutorialride.com/images/non-verbal-analogy-questions/non-verbal-analogy-logical-reasoning-1.jpg><fake_token_around_image>
657
- # What's the correct answer? A, B, C or D?
658
- # Bot:""",
659
- # 1,
660
- # 2,
661
- # 64,
662
- # 10,
663
- # "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
664
- # False,
665
- # False,
666
- # True,
667
- # ],
668
- ]
669
-
670
-
671
- title = """<head><title><h1 align='center'>🔮✍️ Text generation with IDEFICS models 🦙📚</h1></title></head>"""
672
-
673
-
674
- MSG_MAIN = """
675
- # Text generation with Vllama models
676
-
677
- ### Help to write prompts:
678
-
679
- Put the URLs to the images inside the image tokens; they will be converted into real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT a space. Any \\n in the text will be converted into a real newline character. See examples and additional details below.
680
- """
681
- # MSG_DETAILS = """
682
- # ### Additional details
683
- # - if the model was trained with the template 1 (`\\n\\n<image>\\n\\n`), then `<fake_token_around_image>` will be replaced with `\\n\\n`. This is particularly useful if you are comparing the performance of different models trained with different templates.
684
- # - special tokens are not automatically added to the prompt, so add them manually.
685
- # - with the first template `\\n\\n<image>\\n\\n` , the sequence isn't necessary tokenized as `["\\n\\n", "<image>", "\\n\\n"]` to enforce this behavior, you can use the "Integrate image sequence as ids" parameter.
686
- # """
687
- # if ~IS_MAIN_SPACE:
688
- # MSG_DETAILS += (
689
- # "- alternatively, you can upload images and then directly specify them via \<image\> tag in the prompt."
690
- # )
691
-
692
- with gr.Blocks() as demo:
693
- gr.HTML(title)
694
- gr.HTML("""<h3 align='center'>Help to write prompts:🙌</h3><br>
695
- <p>Put the urls to the images inside the image tokens,
696
- it will be converted into the real image tokens.
697
- Put <fake_token_around_image> before and after each
698
- image token WITHOUT space. The texts \\n will be
699
- converted into real newline characters.
700
- See examples and additional details below.""")
701
-
702
- #gr.HTML("<h3 align='center'>Help to write prompts:🙌</h3><br>Put the urls to the images inside the image tokens, it will be converted into the real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT space. The texts \\n will be converted into real newline characters. See examples and additional details below.")
703
- #gr.Markdown(MSG_MAIN)
704
- #with gr.Row():
705
- #with gr.Column():
706
- gr.Markdown("## Input")
707
- with gr.Row():
708
- if not IS_MAIN_SPACE:
709
- images = gr.File(label="Images", file_count="multiple")
710
- prompt = gr.Textbox(label="Prompt", placeholder="Enter the prompt here", lines=5)
711
-
712
- #gr.Markdown("## Common parameters to all decoding strategy")
713
- with gr.Row():
714
- with gr.Accordion("Common parameters to all decoding strategy", open=False, elem_id="common_params"):
715
- temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=1.0, label="Softmax temperature")
716
- no_repeat_ngram_size = gr.Slider(
717
- minimum=0,
718
- maximum=10,
719
- step=1,
720
- value=0,
721
- label="The size of an n-gram that cannot occur more than once (0=infinity)",
722
- )
723
- max_new_tokens = gr.Slider(
724
- minimum=0, maximum=512, step=1, value=256, label="Maximum number of new tokens to generate"
725
- )
726
- min_length = gr.Slider(
727
- minimum=0, maximum=512, step=1, value=16, label="Minimum length of the sequence to be generated"
728
- )
729
- ban_tokens = gr.Textbox(
730
- label='Tokens to prevent from being generated (separated by ";")',
731
- value="<image>;<fake_token_around_image>",
732
- )
733
- forced_eos_token_id = gr.Checkbox(label="Forced eos token id", value=False)
734
- eos_tokens = gr.Textbox(label="EOS tokens", value="</s>")
735
- force_words = gr.Textbox(label='Force words to be generated (separated by ";")', value="")
736
- length_penalty = gr.Slider(
737
- minimum=-1000,
738
- maximum=1000,
739
- step=0.1,
740
- value=1,
741
- label=(
742
- "length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter"
743
- " sequences."
744
- ),
745
- )
746
- repetition_penalty = gr.Slider(
747
- minimum=0, maximum=10, step=0.01, value=1, label="repetition_penalty. CTRL paper suggests 1.2."
748
- )
749
- hide_special_tokens = gr.Checkbox(label="Hide special tokens in the text", value=False)
750
- stop_generation = gr.Checkbox(
751
- label="Stop generation when an image token, a bos or a eos token is generated", value=False
752
- )
753
-
754
- #gr.Markdown("## Decoding strategy and its specific parameters")
755
- with gr.Accordion("Decoding strategy and its specific parameters", open=False, elem_id="decoding_params"):
756
- decoding_strategy = gr.Dropdown(
757
- ["greedy", "beam_search", "beam_sampling", "sampling_top_k", "sampling_top_p", "contrastive_sampling"],
758
- label="Decoding strategy",
759
- value="greedy",
760
- )
761
- num_beams = gr.Slider(
762
- minimum=0,
763
- maximum=10,
764
- step=1,
765
- value=3,
766
- label="Beam size",
767
- info="Only used if `decoding_strategy` is `beam_search` or `beam_sampling`",
768
- )
769
- top_k = gr.Slider(
770
- minimum=0,
771
- maximum=500,
772
- step=1,
773
- value=50,
774
- label="Top k",
775
- info="Only used if `decoding_strategy` is `sampling_top_k` or `contrastive_sampling`",
776
- )
777
- top_p = gr.Slider(
778
- minimum=0,
779
- maximum=1,
780
- step=0.01,
781
- value=0.95,
782
- label="Top p",
783
- info="Only used if `decoding_strategy` is `sampling_top_p`",
784
- )
785
- penalty_alpha = gr.Slider(
786
- minimum=0,
787
- maximum=1,
788
- step=0.01,
789
- value=0.95,
790
- label="Penalty alpha",
791
- info="Only used if `decoding_strategy` is `contrastive_sampling`",
792
- )
793
-
794
- submit = gr.Button(label="Generate")
795
-
796
- #with gr.Column():
797
- with gr.Row():
798
- if IS_MAIN_SPACE:
799
- outputs = [
800
- gr.Textbox(label=MODEL_TO_DISPLAY_NAME[model], multiline=True, readonly=True)
801
- for model in MODEL_TO_MODEL_CLASS
802
- ]
803
- inference_func = all_model_inference
804
- inputs = [
805
- prompt,
806
- temperature,
807
- no_repeat_ngram_size,
808
- max_new_tokens,
809
- min_length,
810
- ban_tokens,
811
- forced_eos_token_id,
812
- eos_tokens,
813
- force_words,
814
- length_penalty,
815
- repetition_penalty,
816
- hide_special_tokens,
817
- stop_generation,
818
- decoding_strategy,
819
- num_beams,
820
- top_k,
821
- top_p,
822
- penalty_alpha,
823
- ]
824
-
825
- # examples = [example[1:] for example in examples]
826
- else:
827
- outputs = gr.Textbox(label="Generated text", interactive=False, lines=5)
828
- inference_func = model_inference
829
- inputs = [
830
- images,
831
- prompt,
832
- temperature,
833
- no_repeat_ngram_size,
834
- max_new_tokens,
835
- min_length,
836
- ban_tokens,
837
- forced_eos_token_id,
838
- eos_tokens,
839
- force_words,
840
- length_penalty,
841
- repetition_penalty,
842
- hide_special_tokens,
843
- stop_generation,
844
- decoding_strategy,
845
- num_beams,
846
- top_k,
847
- top_p,
848
- penalty_alpha,
849
- ]
850
- with gr.Row():
851
- gr.Examples(inputs=inputs, examples=examples)
852
- # gr.Markdown(MSG_DETAILS)
853
-
854
- submit.click(inference_func, inputs=inputs, outputs=outputs, api_name="model_inference")
855
-
856
- demo.queue()
857
- demo.launch()
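For reference, a minimal sketch of the prompt convention the deleted app relied on: image URLs are written as <image:URL> between <fake_token_around_image> markers, extracted with a regex, fetched into PIL images, and replaced by bare <image> placeholder tokens before tokenization. This is a simplified illustration assuming requests and Pillow are available; the helper name and the example URL are placeholders, not part of the app.

import re
from io import BytesIO

import requests
from PIL import Image


def extract_images_from_prompt(prompt):
    # Capture every URL written with the <image:URL> convention.
    urls = re.findall(r"<image:(.*?)>", prompt)
    images = [Image.open(BytesIO(requests.get(url, stream=True).content)) for url in urls]
    # Keep a bare <image> token where each URL was, which is what the tokenizer expects.
    cleaned_prompt = re.sub(r"<image:.*?>", "<image>", prompt)
    return cleaned_prompt, images


prompt = (
    "User:<fake_token_around_image><image:https://example.com/cat.jpg>"
    "<fake_token_around_image>Describe this image.\nAssistant:"
)
cleaned_prompt, images = extract_images_from_prompt(prompt)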
 
 
app_dialogue.py CHANGED
@@ -1,37 +1,52 @@
 
 
1
  import os
 
 
 
 
 
2
  import gradio as gr
 
 
 
 
3
 
4
 
5
- models = [
6
  "HuggingFaceM4/idefics-9b-instruct",
7
  # "HuggingFaceM4/idefics-80b-instruct",
8
  ]
9
 
10
- SYSTEM_PROMPT = """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
11
-
12
- The conversation begins:
13
- User:<fake_token_around_image><image:https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg><fake_token_around_image>Describe this image.</s>
14
- Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitten, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>
15
- User:How about this image?<fake_token_around_image><image:https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg><fake_token_around_image>Can you describe it too?</s>
16
- Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.</s>
17
- User: What kind of breed is it?</s>
18
- Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.</s>
19
- User: What can you tell me about this breed of dogs?</s>
20
- Assistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.</s>
21
- User: ghjkhjabnufs</s>
22
- Assistant: That doesn’t seem to be a word. Could you ask me another way?</s>
23
- User: Do you like Cavalier King Charles Spaniel?</s>
24
- Assistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.</s>
25
- User: How many dogs do you see in this image?<fake_token_around_image><image:https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg><fake_token_around_image></s>
26
- Assistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.</s>"""
27
-
28
- BAN_TOKENS = "<image>;<fake_token_around_image>"
29
- EOS_TOKENS = "</s>;User"
30
 
31
- import logging
 
 
32
 
33
- from accelerate.utils import get_max_memory
34
- from transformers import AutoConfig, IdeficsForVisionText2Text, AutoProcessor
35
 
36
 
37
  TOKENIZER_FAST = True
@@ -41,20 +56,165 @@ logging.basicConfig(level=logging.INFO)
41
  logger = logging.getLogger()
42
 
43
 
 
 
44
  def load_processor_tokenizer_model(model_name):
45
  processor = AutoProcessor.from_pretrained(
46
  model_name,
47
- use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
48
  truncation_side="left",
49
  )
50
  tokenizer = processor.tokenizer
51
- # tokenizer = AutoTokenizer.from_pretrained(
52
- # model_name,
53
- # use_fast=TOKENIZER_FAST,
54
- # use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
55
- # truncation_side="left",
56
- # )
57
- # tokenizer.padding_side = "left" -> we don't need that, do we?
58
 
59
  config = AutoConfig.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
60
  max_memory_map = get_max_memory()
@@ -68,7 +228,7 @@ def load_processor_tokenizer_model(model_name):
68
 
69
  model = IdeficsForVisionText2Text.from_pretrained(
70
  model_name,
71
- use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
72
  device_map="auto",
73
  offload_folder="./offload",
74
  torch_dtype=config.torch_dtype,
@@ -81,20 +241,48 @@ def load_processor_tokenizer_model(model_name):
81
  return processor, tokenizer, model
82
 
83
 
84
- def split_prompt_into_list(prompt_str):
85
- """Convert a full string prompt to the list format expected by the processor."""
86
- prompt_splitted = prompt_str.split("<fake_token_around_image>")
87
- prompt_list = []
88
- for ps in prompt_splitted:
89
- if ps.startswith("<image:"):
90
- prompt_list.append(ps[7:-1])
 
 
91
  else:
92
- prompt_list.append(ps)
93
- return prompt_list
 
 
94
 
95
 
96
  def model_generation(
97
- prompt,
98
  processor,
99
  tokenizer,
100
  model,
@@ -115,10 +303,9 @@ def model_generation(
115
  penalty_alpha,
116
  ):
117
  input_args = processor(
118
- [split_prompt_into_list(prompt)],
119
- eval_mode=True,
120
  truncation=True,
121
- max_length=MAX_SEQ_LEN - 512, # TODO: replace the 512 value with `max_new_tokens`
122
  padding=True,
123
  )
124
  for k, v in input_args.items():
@@ -214,7 +401,7 @@ def model_generation(
214
 
215
  logger.info(
216
  "Result: \n"
217
- f"----Prompt: `{prompt}`\n"
218
  f"----Tokens ids - prompt + generation: `{generated_tokens[0].tolist()}`\n"
219
  f"----Tokens converted - prompt + generation: `{tokens}`\n"
220
  f"----String decoded with skipped special tokens - prompt + generation: `{decoded_skip_special_tokens}`\n"
@@ -247,24 +434,15 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
247
  with gr.Column(scale=3):
248
  with gr.Row(elem_id="model_selector_row"):
249
  model_selector = gr.Dropdown(
250
- choices=models,
251
- value=models[0] if len(models) > 0 else "",
252
  interactive=True,
253
  show_label=False,
254
  container=False,
255
  )
256
  processor, tokenizer, model = load_processor_tokenizer_model(model_selector.value)
257
 
258
- imagebox = gr.Image(
259
- type="pil",
260
- label=(
261
- "Image input - This image box is not supported yet! To include images, do through the text by"
262
- " adding `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`. The backend takes"
263
- " care of parsing that <image:URL> and download the correponding image. That way, you can"
264
- " technically interleave as many images and texts as you want. No need to add space before and"
265
- " after `<fake_token_around_image>`"
266
- ),
267
- )
268
 
269
  with gr.Accordion("Generation parameters", open=False, visible=True) as parameter_row:
270
  max_new_tokens = gr.Slider(
@@ -304,9 +482,9 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
304
  decoding_strategy = gr.Radio(
305
  [
306
  "greedy",
307
- "beam_search",
308
- "beam_sampling",
309
- "sampling_top_k",
310
  "sampling_top_p",
311
  ],
312
  value="greedy",
@@ -404,6 +582,13 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
404
  )
405
 
406
  with gr.Column(scale=6):
 
 
407
  chatbot = gr.Chatbot(
408
  elem_id="chatbot",
409
  label="Idefics Chatbot",
@@ -412,8 +597,10 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
412
  value=[
413
  [
414
  (
415
- "What's unusual about this"
416
- " image?<fake_token_around_image><image:https://i.redd.it/9nkcvbi1pp9y.jpg><fake_token_around_image>"
 
 
417
  ),
418
  (
419
  "The unusual aspect of this image is that there is a cat lying on a bed with an orange on"
@@ -424,8 +611,10 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
424
  ],
425
  [
426
  (
427
- "<fake_token_around_image><image:https://www.boredpanda.com/blog/wp-content/uploads/2014/02/funny-wet-cats-coverimage.jpg><fake_token_around_image>What"
428
- " about this cat? Why do people find it funny?"
 
 
429
  ),
430
  (
431
  "The cat in the image is a gray and white long-haired cat with a surprised expression on"
@@ -445,32 +634,11 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
445
  with gr.Column(scale=1, min_width=20):
446
  clear_btn = gr.ClearButton([textbox, chatbot])
447
  cur_dir = os.path.dirname(os.path.abspath(__file__))
448
-
449
- #def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
450
- # chat = model_inference(message, [], "greedy", 3, 1.0, 0, 512, 16, 1.0, 1.0, 50, 0.95, 0.95)
451
- # return '', chat
452
-
453
- #gr.Examples(
454
- # examples=[
455
- # [
456
- # ("How many of these animals can we fit into an engine like that<fake_token_around_image><image:https://upload.wikimedia.org/wikipedia/commons/thumb/4/4e/Nassau_County_Police_Bell_407.jpg/1200px-Nassau_County_Police_Bell_407.jpg><fake_token_around_image>?", "The image shows a helicopter with a large engine, but it is not possible to determine the exact number of animals that can fit into it based on the image alone. The size and capacity of the helicopter's engine would depend on various factors, such as the size of the animals, the weight of the animals, and the size of the helicopter itself. However, it is safe to assume that the helicopter is designed to carry a limited number of animals, and it is not intended to be used as a means of transporting large groups of animals."),
457
- # ],
458
- # ],
459
- # inputs = [chatbot]
460
- # )
461
-
462
-
463
- def format_prompt_with_history_and_system_conditioning(current_user_prompt, history):
464
- resulting_text = SYSTEM_PROMPT
465
- for turn in history:
466
- user_utterance, assistant_utterance = turn
467
- resulting_text += f"\nUser: {user_utterance}</s>\nAssistant: {assistant_utterance}</s>"
468
- resulting_text += f"\nUser: {current_user_prompt}</s>\nAssistant:"
469
- return resulting_text
470
 
471
  def model_inference(
472
- user_prompt,
473
  chat_history,
 
474
  decoding_strategy,
475
  num_beams,
476
  temperature,
@@ -483,28 +651,19 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
483
  top_p,
484
  penalty_alpha,
485
  ):
486
- global processor, model, tokenizer
487
- # temperature = 1.0
488
- # no_repeat_ngram_size = 0
489
- # max_new_tokens = 512
490
- # min_length = 16
491
  force_words = ""
492
- # repetition_penalty = 1.0
493
  hide_special_tokens = False
494
- # decoding_strategy = "greedy"
495
- # num_beams = 3
496
- # length_penalty = 1.0
497
- # top_k = 50
498
- # top_p = 0.95
499
- # penalty_alpha = 0.95
500
-
501
- formated_prompt = format_prompt_with_history_and_system_conditioning(
502
- current_user_prompt=user_prompt.strip(),
503
  history=chat_history,
504
  )
505
 
506
  generated_text = model_generation(
507
- prompt=formated_prompt,
508
  processor=processor,
509
  tokenizer=tokenizer,
510
  model=model,
@@ -525,14 +684,24 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
525
  penalty_alpha=penalty_alpha,
526
  )
527
 
528
- chat_history.append((user_prompt, generated_text.strip("</s>")))
529
- return "", chat_history
 
 
530
 
531
  textbox.submit(
532
  fn=model_inference,
533
  inputs=[
534
  textbox,
535
  chatbot,
 
536
  decoding_strategy,
537
  num_beams,
538
  temperature,
@@ -545,13 +714,14 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
545
  top_p,
546
  penalty_alpha,
547
  ],
548
- outputs=[textbox, chatbot],
549
  )
550
  submit_btn.click(
551
  fn=model_inference,
552
  inputs=[
553
  textbox,
554
  chatbot,
 
555
  decoding_strategy,
556
  num_beams,
557
  temperature,
@@ -566,20 +736,10 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
566
  ],
567
  outputs=[
568
  textbox,
 
569
  chatbot,
570
  ],
571
  )
572
 
573
- #gr.Examples(examples=[
574
- # [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"],
575
- # [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
576
- # ["m4-dialogue/images/bear.jpg", "Describe this image"],
577
- # ],
578
- # inputs=[textbox],
579
- # outputs=[textbox, chatbot],
580
- # fn=process_example,
581
- # cache_examples=True,
582
- #)
583
-
584
  demo.queue()
585
  demo.launch()
 
1
+ import base64
2
+ import logging
3
  import os
4
+ import re
5
+ from io import BytesIO
6
+ from typing import List, Optional, Tuple, Union
7
+ from urllib.parse import urlparse
8
+
9
  import gradio as gr
10
+ import PIL
11
+ from accelerate.utils import get_max_memory
12
+ from PIL import Image
13
+ from transformers import AutoConfig, AutoProcessor, IdeficsForVisionText2Text
14
 
15
 
16
+ MODELS = [
17
  "HuggingFaceM4/idefics-9b-instruct",
18
  # "HuggingFaceM4/idefics-80b-instruct",
19
  ]
20
 
21
+ SYSTEM_PROMPT = [
22
+ """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
 
 
23
 
24
+ The conversation begins:\n""",
25
+ """User:<fake_token_around_image><image:https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg><fake_token_around_image>Describe this image.<end_of_utterance>\n""",
26
+ """Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitten, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.<end_of_utterance>\n""",
27
+ (
28
+ "User:How about this"
29
+ " image?<fake_token_around_image><image:https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg><fake_token_around_image>Can"
30
+ " you describe it too?<end_of_utterance>\n"
31
+ ),
32
+ """Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.<end_of_utterance>\n""",
33
+ "User: What kind of breed is it?<end_of_utterance>\n",
34
+ """Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.<end_of_utterance>\n""",
35
+ "User: What can you tell me about this breed of dogs?<end_of_utterance>\n",
36
+ """Assistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.<end_of_utterance>\n""",
37
+ "User: ghjkhjabnufs<end_of_utterance>\n",
38
+ """Assistant: That doesn’t seem to be a word. Could you ask me another way?<end_of_utterance>\n""",
39
+ "User: Do you like Cavalier King Charles Spaniel?<end_of_utterance>\n",
40
+ """Assistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.<end_of_utterance>\n""",
41
+ (
42
+ "User: How many dogs do you see in this"
43
+ " image?<fake_token_around_image><image:https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg><fake_token_around_image><end_of_utterance>\n"
44
+ ),
45
+ """Assistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.<end_of_utterance>\n""",
46
+ ]
47
 
48
+ BAN_TOKENS = "<image>;<fake_token_around_image>"
49
+ EOS_TOKENS = "</s>;<end_of_utterance>;User"
50
 
51
 
52
  TOKENIZER_FAST = True
 
56
  logger = logging.getLogger()
57
 
58
 
59
+ # Conversion between PIL Image <-> base64 <-> Markdown utils
60
+ def pil_to_base64(pil_image):
61
+ """
62
+ Convert a PIL image into a base64 string representation.
63
+ """
64
+ buffered = BytesIO()
65
+ pil_image.save(buffered, format="JPEG") # You can change the format as per your image type
66
+ encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
67
+ return encoded_image
68
+
69
+
70
+ def pil_to_markdown_im(image):
71
+ """
72
+ Convert a PIL image into a markdown image tag containing its base64 string representation.
73
+ """
74
+ img_b64_str = pil_to_base64(image)
75
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" />'
76
+ return img_str
77
+
78
+
79
+ def base64_to_pil(encoded_image):
80
+ decoded_image = base64.b64decode(encoded_image)
81
+ pil_image = Image.open(BytesIO(decoded_image))
82
+ return pil_image
83
+
84
+
85
+ def im_markdown_to_pil(im_markdown_str):
86
+ pattern = r'<img src="data:image/png;base64,([^"]+)" />'
87
+ match = re.search(pattern, im_markdown_str)
88
+ img_b64_str = match.group(1)
89
+ return base64_to_pil(img_b64_str)
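A quick round-trip sketch of how these helpers fit together: images displayed in the chatbot history are carried as inline base64 <img> tags and converted back to PIL images before being re-fed to the model. The test image below is illustrative only.

from PIL import Image

img = Image.new("RGB", (64, 64), color="red")   # any PIL image
markdown_tag = pil_to_markdown_im(img)          # '<img src="data:image/png;base64,..." />'
recovered = im_markdown_to_pil(markdown_tag)    # back to a PIL.Image.Image
assert recovered.size == img.size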
90
+
91
+
92
+ def split_str_on_im_markdown(string_with_potential_im_markdown):
93
+ """
94
+ Extract from a string (typically the user prompt string) the potential images saved as a base64 representation
95
+ inside a markdown image tag.
96
+ """
97
+ pattern = r'<img src="data:image/png;base64,([^"]+)" />'
98
+ parts = re.split(pattern, string_with_potential_im_markdown)
99
+ result = []
100
+
101
+ for i, part in enumerate(parts):
102
+ if i % 2 == 0:
103
+ result.append(part)
104
+ else:
105
+ img_tag = f'<img src="data:image/png;base64,{part.strip()}" />'
106
+ result.append(img_tag)
107
+
108
+ return result
109
+
110
+
111
+ # Fetching utils
112
+ def is_url(string):
113
+ """
114
+ Checks if the passed string contains a valid URL and nothing else; e.g. if a space is included, the URL is immediately
115
+ invalidated.
116
+ """
117
+ if " " in string:
118
+ return False
119
+ result = urlparse(string)
120
+ return all([result.scheme, result.netloc])
121
+
122
+
123
+ def isolate_images_urls(prompt_list):
124
+ """
125
+ Convert a full string prompt to the list format expected by the processor.
126
+ In particular, image urls (as delimited by <fake_token_around_image>) should be their own elements.
127
+ From:
128
+ ```
129
+ [
130
+ "bonjour<fake_token_around_image><image:IMG_URL><fake_token_around_image>hello",
131
+ PIL.Image.Image,
132
+ "Aurevoir",
133
+ ]
134
+ ```
135
+ to:
136
+ ```
137
+ [
138
+ "bonjour",
139
+ IMG_URL,
140
+ "hello",
141
+ PIL.Image.Image,
142
+ "Aurevoir",
143
+ ]
144
+ ```
145
+ """
146
+ linearized_list = []
147
+ for prompt in prompt_list:
148
+ # Prompt can be either a string, or a PIL image
149
+ if isinstance(prompt, PIL.Image.Image):
150
+ linearized_list.append(prompt)
151
+ elif isinstance(prompt, str):
152
+ if "<fake_token_around_image>" not in prompt:
153
+ linearized_list.append(prompt)
154
+ else:
155
+ prompt_splitted = prompt.split("<fake_token_around_image>")
156
+ for ps in prompt_splitted:
157
+ if ps == "":
158
+ continue
159
+ if ps.startswith("<image:"):
160
+ linearized_list.append(ps[7:-1])
161
+ else:
162
+ linearized_list.append(ps)
163
+ else:
164
+ raise TypeError(
165
+ f"Unrecognized type for `prompt`. Got {type(type(prompt))}. Was expecting something in [`str`,"
166
+ " `PIL.Image.Image`]"
167
+ )
168
+ return linearized_list
169
+
170
+
171
+ # Chatbot handling utils
172
+ def handle_manual_images_in_user_prompt(user_prompt: str) -> List[Union[str, PIL.Image.Image]]:
173
+ """
174
+ Handle the case of images manually specified in the text (i.e. with the `<fake_token_around_image><image:IMG_URL><fake_token_around_image>` syntax) in the user prompt
175
+ by fetching them and replacing the whole sub-sequence by a PIL image.
176
+ """
177
+ if "<fake_token_around_image>" in user_prompt:
178
+ splitted_user_prompt = isolate_images_urls([user_prompt])
179
+ resulting_user_prompt = []
180
+ for up in splitted_user_prompt:
181
+ if is_url(up):
182
+ img = processor.image_processor.fetch_images([up])[0]
183
+ resulting_user_prompt.append(img)
184
+ else:
185
+ resulting_user_prompt.append(up)
186
+ return resulting_user_prompt
187
+ else:
188
+ return [user_prompt]
189
+
190
+
191
+ def user_prompt_list_to_markdown(user_prompt_list: List[Union[str, PIL.Image.Image]]):
192
+ """
193
+ Convert a user prompt in the list format (i.e. elements are either a PIL image or a string) into
194
+ the markdown format that is used for the chatbot history and rendering.
195
+ """
196
+ resulting_string = ""
197
+ for elem in user_prompt_list:
198
+ if isinstance(elem, str):
199
+ resulting_string += elem
200
+ elif isinstance(elem, PIL.Image.Image):
201
+ resulting_string += pil_to_markdown_im(elem)
202
+ else:
203
+ raise ValueError(
204
+ "Unknown type for `user_prompt_list`. Expected an element of type `str` or `PIL.Image.Image` and got"
205
+ f" `{type(elem)}`"
206
+ )
207
+ return resulting_string
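To illustrate how the two helpers above combine (assuming the processor has been loaded so that fetch_images works, and using a placeholder URL): a prompt typed with the manual <image:URL> syntax is first turned into a list of strings and PIL images, then rendered back to markdown so the fetched image shows up inline in the chatbot history. This is the same path prefetch_images_in_history takes further below.

user_prompt = (
    "Describe this image:<fake_token_around_image>"
    "<image:https://example.com/dog.jpg><fake_token_around_image>"  # placeholder URL
)
as_list = handle_manual_images_in_user_prompt(user_prompt)  # ["Describe this image:", <PIL.Image.Image>]
as_markdown = user_prompt_list_to_markdown(as_list)         # text with an inline base64 <img> tag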
208
+
209
+
210
+ # Model and generation utils
211
  def load_processor_tokenizer_model(model_name):
212
  processor = AutoProcessor.from_pretrained(
213
  model_name,
214
+ token=os.getenv("HF_AUTH_TOKEN", True),
215
  truncation_side="left",
216
  )
217
  tokenizer = processor.tokenizer
 
 
218
 
219
  config = AutoConfig.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
220
  max_memory_map = get_max_memory()
 
228
 
229
  model = IdeficsForVisionText2Text.from_pretrained(
230
  model_name,
231
+ token=os.getenv("HF_AUTH_TOKEN", True),
232
  device_map="auto",
233
  offload_folder="./offload",
234
  torch_dtype=config.torch_dtype,
 
241
  return processor, tokenizer, model
242
 
243
 
244
+ def format_user_prompt_with_im_history_and_system_conditioning(
245
+ current_user_prompt_str: str, current_image: Optional[PIL.Image.Image], history: List[Tuple[str, str]]
246
+ ) -> List[Union[str, PIL.Image.Image]]:
247
+ """
248
+ Produces the resulting list that needs to go inside the processor.
249
+ It handles the potential image box input, the history and the system conditioning.
250
+ """
251
+ resulting_list = SYSTEM_PROMPT
252
+
253
+ # Format history
254
+ for turn in history:
255
+ user_utterance, assistant_utterance = turn
256
+ splitted_user_utterance = split_str_on_im_markdown(user_utterance)
257
+ splitted_user_utterance = [
258
+ im_markdown_to_pil(s) if s.startswith('<img src="data:image/png;base64,') else s
259
+ for s in splitted_user_utterance
260
+ if s != ""
261
+ ]
262
+ if isinstance(splitted_user_utterance[0], str):
263
+ resulting_list.append("\nUser: ")
264
  else:
265
+ resulting_list.append("\nUser:")
266
+ resulting_list.extend(splitted_user_utterance)
267
+ resulting_list.append(f"<end_of_utterance>\nAssistant: {assistant_utterance}")
268
+
269
+ # Format current input
270
+ if current_image is None:
271
+ if "<img src=data:image/png;base64" in current_user_prompt_str:
272
+ raise ValueError("The UI does not support inputing via the text box an image in base64.")
273
+ current_user_prompt_list = handle_manual_images_in_user_prompt(current_user_prompt_str)
274
+ resulting_list.append("\nUser: ")
275
+ resulting_list.extend(current_user_prompt_list)
276
+ resulting_list.append("<end_of_utterance>\nAssistant:")
277
+ return resulting_list, current_user_prompt_list
278
+ else:
279
+ # The image is placed first when it is provided through the UI image box; this ordering is an arbitrary choice.
280
+ resulting_list.extend(["\nUser:", current_image, f"{current_user_prompt_str}<end_of_utterance>\nAssistant:"])
281
+ return resulting_list, [current_user_prompt_str]
282
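To make the contract of `format_user_prompt_with_im_history_and_system_conditioning` concrete, here is a hypothetical example of the list it returns for one past text-only turn followed by a new question coming from the image box; the actual system prompt and images come from the app at runtime.

```python
from PIL import Image

new_image = Image.new("RGB", (32, 32), "blue")  # stand-in for the image-box input

# Rough shape of the list handed to the processor (SYSTEM_PROMPT elements come first).
prompt_for_processor = [
    "The following is a conversation between a user and an assistant ...",  # system conditioning
    "\nUser: ",
    "What is the capital of France?",
    "<end_of_utterance>\nAssistant: Paris.",
    "\nUser:",
    new_image,  # image placed right after "User:" when it comes from the image box
    "Describe this image.<end_of_utterance>\nAssistant:",
]
```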
 
283
 
284
  def model_generation(
285
+ prompt_list,
286
  processor,
287
  tokenizer,
288
  model,
 
303
  penalty_alpha,
304
  ):
305
  input_args = processor(
306
+ isolate_images_urls(prompt_list),
 
307
  truncation=True,
308
+ max_length=MAX_SEQ_LEN - max_new_tokens,
309
  padding=True,
310
  )
311
  for k, v in input_args.items():
 
401
 
402
  logger.info(
403
  "Result: \n"
404
+ f"----Prompt: `{prompt_list}`\n"
405
  f"----Tokens ids - prompt + generation: `{generated_tokens[0].tolist()}`\n"
406
  f"----Tokens converted - prompt + generation: `{tokens}`\n"
407
  f"----String decoded with skipped special tokens - prompt + generation: `{decoded_skip_special_tokens}`\n"
 
434
  with gr.Column(scale=3):
435
  with gr.Row(elem_id="model_selector_row"):
436
  model_selector = gr.Dropdown(
437
+ choices=MODELS,
438
+ value=MODELS[0] if len(MODELS) > 0 else "",
439
  interactive=True,
440
  show_label=False,
441
  container=False,
442
  )
443
  processor, tokenizer, model = load_processor_tokenizer_model(model_selector.value)
444
 
445
+ imagebox = gr.Image(type="pil", label="Image input")
 
 
 
 
 
 
 
 
 
446
 
447
  with gr.Accordion("Generation parameters", open=False, visible=True) as parameter_row:
448
  max_new_tokens = gr.Slider(
 
482
  decoding_strategy = gr.Radio(
483
  [
484
  "greedy",
485
+ # "beam_search",
486
+ # "beam_sampling",
487
+ # "sampling_top_k",
488
  "sampling_top_p",
489
  ],
490
  value="greedy",
 
582
  )
583
 
584
  with gr.Column(scale=6):
585
+
586
+ def prefetch_images_in_history(user_prompt_str):
587
+ """
588
+ Pre-fetch the images that are passed in the chatbot default history.
589
+ """
590
+ return user_prompt_list_to_markdown(handle_manual_images_in_user_prompt(user_prompt_str))
591
+
592
  chatbot = gr.Chatbot(
593
  elem_id="chatbot",
594
  label="Idefics Chatbot",
 
597
  value=[
598
  [
599
  (
600
+ prefetch_images_in_history(
601
+ "What's unusual about this"
602
+ " image?<fake_token_around_image><image:https://i.redd.it/9nkcvbi1pp9y.jpg><fake_token_around_image>"
603
+ )
604
  ),
605
  (
606
  "The unusual aspect of this image is that there is a cat lying on a bed with an orange on"
 
611
  ],
612
  [
613
  (
614
+ prefetch_images_in_history(
615
+ "<fake_token_around_image><image:https://www.boredpanda.com/blog/wp-content/uploads/2014/02/funny-wet-cats-coverimage.jpg><fake_token_around_image>What"
616
+ " about this cat? Why do people find it funny?"
617
+ )
618
  ),
619
  (
620
  "The cat in the image is a gray and white long-haired cat with a surprised expression on"
 
634
  with gr.Column(scale=1, min_width=20):
635
  clear_btn = gr.ClearButton([textbox, chatbot])
636
  cur_dir = os.path.dirname(os.path.abspath(__file__))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
637
 
638
  def model_inference(
639
+ user_prompt_str,
640
  chat_history,
641
+ image,
642
  decoding_strategy,
643
  num_beams,
644
  temperature,
 
651
  top_p,
652
  penalty_alpha,
653
  ):
654
+ # global processor, model, tokenizer
655
+
 
 
 
656
  force_words = ""
 
657
  hide_special_tokens = False
658
+
659
+ formated_prompt_list, user_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
660
+ current_user_prompt_str=user_prompt_str.strip(),
661
+ current_image=image,
 
 
 
 
 
662
  history=chat_history,
663
  )
664
 
665
  generated_text = model_generation(
666
+ prompt_list=formated_prompt_list,
667
  processor=processor,
668
  tokenizer=tokenizer,
669
  model=model,
 
684
  penalty_alpha=penalty_alpha,
685
  )
686
 
687
+ if image is None:
688
+ # Case where there is no image OR the image is passed as `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`
689
+ # `str.strip` treats its argument as a set of characters rather than a suffix, so remove the special token explicitly.
+ chat_history.append((user_prompt_list_to_markdown(user_prompt_list), generated_text.replace("<end_of_utterance>", "")))
690
+ else:
691
+ # Case where the image is passed through the Image Box.
692
+ # Convert the image into base64 for both passing it through the chat history and
693
+ # displaying the image inside the same bubble as the text.
694
+ chat_history.append(
695
+ (f"{user_prompt_list_to_markdown([image] + user_prompt_list)}", generated_text.strip("<end_of_utterance>"))
696
+ )
697
+ return "", None, chat_history
698
 
699
  textbox.submit(
700
  fn=model_inference,
701
  inputs=[
702
  textbox,
703
  chatbot,
704
+ imagebox,
705
  decoding_strategy,
706
  num_beams,
707
  temperature,
 
714
  top_p,
715
  penalty_alpha,
716
  ],
717
+ outputs=[textbox, imagebox, chatbot],
718
  )
719
  submit_btn.click(
720
  fn=model_inference,
721
  inputs=[
722
  textbox,
723
  chatbot,
724
+ imagebox,
725
  decoding_strategy,
726
  num_beams,
727
  temperature,
 
736
  ],
737
  outputs=[
738
  textbox,
739
+ imagebox,
740
  chatbot,
741
  ],
742
  )
743
 
 
 
 
 
 
 
 
 
 
 
 
744
  demo.queue()
745
  demo.launch()
elon_musk.md DELETED
@@ -1,23 +0,0 @@
1
- This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answers the questions from the user. The bot should be friendly, informative and should not change the topic if it's not asked to.
2
-
3
- Here's an example of a conversation:
4
- User: <fake_token_around_image> <image> <fake_token_around_image> Describe this image.
5
- Bot: Two kittens are cuddling in the center of the photograph. They are surrounded by tall, bright green grass, and the background is blurred.
6
- User: <fake_token_around_image> <image> <fake_token_around_image> How about this image? Can you describe it too?
7
- Bot: A dog is lying on the floor, looking at the camera. It is looking directly at you, tilting its head to the side. The dog has a white body and brown patches on its face and ears. Its eyes are brown. Its nose is black, and it has long, floppy ears, short legs, white paws, long fur, big eyes, and black eyebrows.
8
- User: What kind of breed is it?
9
- Bot: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.
10
- ---
11
- User: <fake_token_around_image> <image> <fake_token_around_image>
12
- Describe this image.
13
- Bot: This image shows a robot named Wall-E. He is standing in front of a pile of garbage, holding a plant in his hands. His eyes and mouth are open, as if he is surprised. Wall E is made of metal and has two arms and two legs. There is also a small light on his head. In the distance, there are some buildings and a large mountain. This scene takes place on Earth, which is covered in trash.
14
- User: What happens in the film? Explain in a few sentences.
15
- Bot: WallE is an animated film about a lonely robot who falls in love with another robot. When they meet, they discover that they have been programmed to clean up the Earth. Together, the robots work to restore the planet to its former glory. Along the way, Wall e learns about friendship, love, responsibility, sacrifice, hope, courage, determination, perseverance, teamwork, loyalty, honesty, trust, forgiveness, compassion, empathy, selflessness, kindness and generosity. Ultimately, he realizes that he can make a difference in this world.
16
- User: In which year was the film released?
17
- Bot: The film was released in 3D in June 08, 4 years ago.
18
- User: Is a future in which the earth is not liveable anymore (similarly as in the movie) conceivable?
19
- Bot: Yes, it is possible. If we continue to pollute the environment and destroy the natural resources, we will eventually make the world uninhabitable. We need to take action now to protect the future of our planet. One way to do this is to reduce our carbon footprint by using less energy and recycling more. Another way is by supporting companies that are working to develop sustainable technologies. Finally, you can also get involved in local environmental initiatives, such as planting trees or cleaning up parks. By taking these steps, each of us can help to make sure that our children and grandchildren will be able to enjoy a healthy and beautiful planet for many years to come.
20
- User: I think this is a lost cause. <fake_token_around_image> <image> <fake_token_around_image> Like Elon Musk, I think we should instead all move to another planet now.
21
- Bot: El ón Mus ́k is the CEO of SpaceX and Tesla Motors. Space X is developing a reusable rocket system that could be used to transport people and cargo to Mars. Mus k believes that colonizing Mars is essential for the survival of humanity, because it would provide a backup plan in case something goes wrong on earth. However, many people are skeptical about the feasibility of such a project. Some argue that it will take too long and cost too much money to establish a colony on Mars, while others say that the risks are too great. Nevertheless, Mus K remains optimistic about his vision for Mars colonization. "I think there is good reason to believe that we can build a self-sustaining city on mars," he said in an interview with Time magazine. “It would be quite an adventure.”
22
- ----
23
- The project is still in development. I'm currently working on improving the accuracy of image recognition and adding more features to it.</s>
hello.py DELETED
@@ -1,73 +0,0 @@
1
- [
2
- """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.\n\nThe conversation begins:\nUser:"""
3
- """https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg""",
4
- """Describe this image.</s>\n""",
5
- """Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>\n""",
6
- """User:How about this image?""",
7
- "https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg""",
8
- """Can you describe it too?</s>\n"""
9
- """Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.</s>\n""",
10
- """User: What kind of breed is it?</s>\n""",
11
- """Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.</s>\n""",
12
- """User: What can you tell me about this breed of dogs?</s>\n"""
13
- """Assistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.</s>\n""",
14
- """User: ghjkhjabnufs</s>\n""",
15
- """Assistant: That doesn’t seem to be a word. Could you ask me another way?</s>\n""",
16
- """User: Do you like Cavalier King Charles Spaniel?</s>\n""",
17
- """Assistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.</s>\n""",
18
- """User: How many dogs do you see in this image?""",
19
- """https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg""",
20
- """</s>\n""",
21
- """Assistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.</s>\n""",
22
- """User: What color are the cat's eyes?</s>\n""",
23
- """Assistant:"""
24
- ]
25
-
26
- SYSTEM_PROMPT = """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
27
-
28
- The conversation begins:
29
- User:<fake_token_around_image><image:https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg><fake_token_around_image>Describe this image.</s>
30
- Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>"""
31
-
32
- prompts = [
33
- """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.\n\nThe conversation begins:\nUser:""",
34
- """https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg""",
35
- """Describe this image.</s>\nAssistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>"""
36
- ]
37
-
38
- SYSTEM_PROMPT = """HELLO
39
- AUREVOIR
40
- User:<fake_token_around_image><image:AAA><fake_token_around_image><image:BBB><fake_token_around_image>BOJOUR</s>
41
- Assistant: SPECIFIC</s>"""
42
-
43
- def split_prompt_into_list(prompt_str):
44
- prompt_splitted = prompt_str.split("<fake_token_around_image>")
45
- prompt_list = []
46
- for ps in prompt_splitted:
47
- if ps.startswith("<image:"):
48
- prompt_list.append(ps[7:-1])
49
- else:
50
- prompt_list.append(ps)
51
- return prompt_list
52
-
53
- prompts = split_prompt_into_list(formated_prompt)
54
- i = processor(prompts, eval_mode=True)
55
-
56
- from transformers import AutoTokenizer, AutoProcessor
57
- checkpoint = "HuggingFaceM4/idefics-9b-instruct"
58
- tokenizer = AutoTokenizer.from_pretrained(checkpoint)
59
- processor = AutoProcessor.from_pretrained(checkpoint)
60
-
61
- prompts = [
62
- [
63
- "User:",
64
- # "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg",
65
- # "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg",
66
- "Describe this image.\nAssistant: An image of two kittens in grass.\n",
67
- "User:",
68
- # "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
69
- "Describe this image.\nAssistant:",
70
- ],
71
- ]
72
- inputs = processor(prompts, eval_mode=True)
73
- tokenizer.decode(inputs["input_ids"][0])
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  --extra-index-url https://download.pytorch.org/whl/cu113
2
  torch
3
- transformers @ git+https://github.com/huggingface/transformers.git@4dbd4d1a428753659d507613849f0eeea7909f29
4
  requests
5
  pillow
6
  torchvision
 
1
  --extra-index-url https://download.pytorch.org/whl/cu113
2
  torch
3
+ transformers @ git+https://github.com/huggingface/transformers@e8817e851aa55c1f8ec8f1585d25425166711583
4
  requests
5
  pillow
6
  torchvision