VictorSanh committed
Commit 0e145dd
1 Parent(s): cf142d2

Update visualization

Files changed (5)
  1. app_bis.py +9 -7
  2. app_dialogue.py +100 -76
  3. elon_musk.md +1 -1
  4. hello.py +73 -0
  5. requirements.txt +1 -1
app_bis.py CHANGED
@@ -1,6 +1,7 @@
import logging
import os
import re
+
import time
from io import BytesIO

@@ -9,6 +10,7 @@ import requests
import torch
import transformers
from accelerate.utils import get_max_memory
+
from joblib import Parallel, delayed
from PIL import Image
from transformers import AutoTokenizer
@@ -697,17 +699,17 @@ with gr.Blocks() as demo:
converted into real newline characters.
See examples and additional details below.""")

- # gr.HTML("<h3 align='center'>Help to write prompts:🙌</h3><br>Put the urls to the images inside the image tokens, it will be converted into the real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT space. The texts \\n will be converted into real newline characters. See examples and additional details below.")
- # gr.Markdown(MSG_MAIN)
- # with gr.Row():
- # with gr.Column():
+ #gr.HTML("<h3 align='center'>Help to write prompts:🙌</h3><br>Put the urls to the images inside the image tokens, it will be converted into the real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT space. The texts \\n will be converted into real newline characters. See examples and additional details below.")
+ #gr.Markdown(MSG_MAIN)
+ #with gr.Row():
+ #with gr.Column():
gr.Markdown("## Input")
with gr.Row():
if not IS_MAIN_SPACE:
images = gr.File(label="Images", file_count="multiple")
prompt = gr.Textbox(label="Prompt", placeholder="Enter the prompt here", lines=5)

- # gr.Markdown("## Common parameters to all decoding strategy")
+ #gr.Markdown("## Common parameters to all decoding strategy")
with gr.Row():
with gr.Accordion("Common parameters to all decoding strategy", open=False, elem_id="common_params"):
temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=1.0, label="Softmax temperature")
@@ -749,7 +751,7 @@ with gr.Blocks() as demo:
label="Stop generation when an image token, a bos or a eos token is generated", value=False
)

- # gr.Markdown("## Decoding strategy and its specific parameters")
+ #gr.Markdown("## Decoding strategy and its specific parameters")
with gr.Accordion("Decoding strategy and its specific parameters", open=False, elem_id="decoding_params"):
decoding_strategy = gr.Dropdown(
["greedy", "beam_search", "beam_sampling", "sampling_top_k", "sampling_top_p", "contrastive_sampling"],
@@ -791,7 +793,7 @@ with gr.Blocks() as demo:

submit = gr.Button(label="Generate")

- # with gr.Column():
+ #with gr.Column():
with gr.Row():
if IS_MAIN_SPACE:
outputs = [
app_dialogue.py CHANGED
@@ -32,12 +32,7 @@ EOS_TOKENS = "</s>;User"
import logging

from accelerate.utils import get_max_memory
- from transformers import (
- AutoTokenizer,
- AutoProcessor,
- AutoConfig,
- AutoModelForCausalLM,
- )
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer


TOKENIZER_FAST = True
@@ -61,9 +56,7 @@ def load_processor_tokenizer_model(model_name):
)
# tokenizer.padding_side = "left" -> we don't need that, do we?

- config = AutoConfig.from_pretrained(
- model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True)
- )
+ config = AutoConfig.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
max_memory_map = get_max_memory()

for key in max_memory_map.keys():
@@ -125,8 +118,7 @@ def model_generation(
[split_prompt_into_list(prompt)],
eval_mode=True,
truncation=True,
- max_length=MAX_SEQ_LEN
- - 512, # TODO: replace the 512 value with `max_new_tokens`
+ max_length=MAX_SEQ_LEN - 512, # TODO: replace the 512 value with `max_new_tokens`
padding=True,
)
for k, v in input_args.items():
@@ -154,9 +146,7 @@ def model_generation(
if len(eos_tokens) > 0:
eos_token_ids = []
for eos_token in eos_tokens:
- tokenized_eos_token = tokenizer(
- eos_token, add_special_tokens=False
- ).input_ids
+ tokenized_eos_token = tokenizer(eos_token, add_special_tokens=False).input_ids
if len(tokenized_eos_token) > 1:
raise ValueError(
f"eos_tokens should be one token, here {eos_token} is {len(tokenized_eos_token)} tokens:"
@@ -214,17 +204,13 @@ def model_generation(

tokens = tokenizer.convert_ids_to_tokens(generated_tokens[0])
decoded_skip_special_tokens = repr(
- tokenizer.batch_decode(
- generated_tokens, skip_special_tokens=hide_special_tokens
- )[0]
+ tokenizer.batch_decode(generated_tokens, skip_special_tokens=hide_special_tokens)[0]
)

actual_generated_tokens = generated_tokens[:, input_args["input_ids"].shape[-1] :]
first_end_token = len(actual_generated_tokens[0])
actual_generated_tokens = actual_generated_tokens[:, :first_end_token]
- generated_text = tokenizer.batch_decode(
- actual_generated_tokens, skip_special_tokens=hide_special_tokens
- )[0]
+ generated_text = tokenizer.batch_decode(actual_generated_tokens, skip_special_tokens=hide_special_tokens)[0]

logger.info(
"Result: \n"
@@ -267,9 +253,7 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
show_label=False,
container=False,
)
- processor, tokenizer, model = load_processor_tokenizer_model(
- model_selector.value
- )
+ processor, tokenizer, model = load_processor_tokenizer_model(model_selector.value)

imagebox = gr.Image(
type="pil",
@@ -282,7 +266,41 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
),
)

- with gr.Accordion("Parameters", open=False, visible=True) as parameter_row:
+ with gr.Accordion("Generation parameters", open=False, visible=True) as parameter_row:
+ max_new_tokens = gr.Slider(
+ minimum=0,
+ maximum=2048,
+ value=512,
+ step=1,
+ interactive=True,
+ label="Maximum number of new tokens to generate",
+ )
+ min_length = gr.Slider(
+ minimum=0,
+ maximum=50,
+ value=0,
+ step=1,
+ interactive=True,
+ label="Minimum number of new tokens to generate",
+ )
+ repetition_penalty = gr.Slider(
+ minimum=0.0,
+ maximum=5.0,
+ value=1.0,
+ step=0.1,
+ interactive=True,
+ label="Repetition penalty",
+ info="1.0 means no penalty",
+ )
+ no_repeat_ngram_size = gr.Slider(
+ minimum=0,
+ maximum=10,
+ value=0,
+ step=1,
+ interactive=True,
+ label="N-gram repetition threshold",
+ info="If set to int > 0, all ngrams of that size can only occur once.",
+ )
decoding_strategy = gr.Radio(
[
"greedy",
@@ -293,70 +311,81 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
],
value="greedy",
label="Decoding strategy",
+ interactive=True,
)
- num_beams = top_k = gr.Slider(
+ temperature = gr.Slider(
+ minimum=0.0,
+ maximum=5.0,
+ value=1.0,
+ step=0.1,
+ interactive=True,
+ label="Sampling temperature",
+ )
+ num_beams = gr.Slider(
minimum=0,
maximum=20,
value=3.0,
step=1.0,
interactive=True,
+ visible=False,
label="Number of beams",
+ info="Only used if `decoding_strategy` is `beam_search` or `beam_sampling`.",
)
- temperature = gr.Slider(
- minimum=0.0,
- maximum=1.0,
- value=0.2,
- step=0.1,
- interactive=True,
- label="Temperature",
+ decoding_strategy.change(
+ fn=lambda selection: gr.Slider.update(visible=(selection in ["beam_search", "beam_sampling"])),
+ inputs=decoding_strategy,
+ outputs=num_beams,
)
top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.95,
- step=0.1,
+ step=0.01,
interactive=True,
+ visible=False,
label="Top P",
+ info=(
+ "If set to float < 1, only the smallest set of most probable tokens with probabilities that"
+ " add up to top_p or higher are kept for generation."
+ ),
)
- top_k = gr.Slider(
- minimum=0.0,
- maximum=100.0,
- value=50.0,
- step=1.0,
- interactive=True,
- label="Top K",
+ decoding_strategy.change(
+ fn=lambda selection: gr.Slider.update(visible=(selection in ["sampling_top_p"])),
+ inputs=decoding_strategy,
+ outputs=top_p,
)
- max_new_tokens = gr.Slider(
+ top_k = gr.Slider(
minimum=0,
- maximum=2048,
- value=512,
- step=64,
+ maximum=500,
+ value=50,
+ step=1,
interactive=True,
- label="Max output tokens",
- )
- repetition_penalty = gr.Slider(
- minimum=0.0,
- maximum=5.0,
- value=1.0,
- step=0.1,
- interactive=True,
- label="Repetition penalty",
+ visible=False,
+ label="Top K",
+ info="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
)
- min_length = gr.Slider(
- minimum=0.0,
- maximum=50.0,
- value=0.0,
- step=1.0,
- interactive=True,
- label="Minimum generation length",
+ decoding_strategy.change(
+ fn=lambda selection: gr.Slider.update(visible=(selection in ["sampling_top_k"])),
+ inputs=decoding_strategy,
+ outputs=top_k,
)
length_penalty = gr.Slider(
- minimum=0.0,
- maximum=5.0,
+ minimum=-1000.0,
+ maximum=1000.0,
value=1.0,
step=0.1,
interactive=True,
+ visible=False,
label="Length penalty",
+ info=(
+ "length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter"
+ " sequences. Only used if `decoding_strategy` is `beam_search` or `beam_sampling`."
+ ),
+ )
+ decoding_strategy.change(
+ fn=lambda selection: gr.Slider.update(visible=(selection in ["beam_search", "beam_sampling"])),
+ inputs=decoding_strategy,
+ outputs=length_penalty,
)
penalty_alpha = gr.Slider(
minimum=0.0,
@@ -364,15 +393,14 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
value=0.95,
step=0.05,
interactive=True,
+ visible=False,
label="Penalty alpha",
+ info="Only used if `decoding_strategy` is `contrastive_sampling`.",
)
- no_repeat_ngram_size = gr.Slider(
- minimum=0.0,
- maximum=10.0,
- value=0.0,
- step=1.0,
- interactive=True,
- label="No repeat ngram size",
+ decoding_strategy.change(
+ fn=lambda selection: gr.Slider.update(visible=(selection in ["contrastive_sampling"])),
+ inputs=decoding_strategy,
+ outputs=penalty_alpha,
)

with gr.Column(scale=6):
@@ -380,7 +408,7 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
elem_id="chatbot",
label="Idefics Chatbot",
visible=True,
- height=750,
+ height=550,
value=[
[
(
@@ -430,15 +458,11 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
# inputs = [chatbot]
# )

- def format_prompt_with_history_and_system_conditioning(
- current_user_prompt, history
- ):
+ def format_prompt_with_history_and_system_conditioning(current_user_prompt, history):
resulting_text = SYSTEM_PROMPT
for turn in history:
user_utterance, assistant_utterance = turn
- resulting_text += (
- f"\nUser: {user_utterance}</s>\nAssistant: {assistant_utterance}</s>"
- )
+ resulting_text += f"\nUser: {user_utterance}</s>\nAssistant: {assistant_utterance}</s>"
resulting_text += f"\nUser: {current_user_prompt}</s>\nAssistant:"
return resulting_text

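The substantive change in app_dialogue.py is that the generation parameters are now wired to the decoding-strategy radio button: sliders such as num_beams, top_p, top_k, length_penalty and penalty_alpha start hidden and are only shown when the strategy that uses them is selected. Below is a minimal, self-contained sketch of that show/hide pattern, assuming the Gradio 3.x API used in this app (where gr.Slider.update(visible=...) is how a component gets updated); the component names and choices here are illustrative, not the app's exact ones.

import gradio as gr

with gr.Blocks() as demo:
    decoding_strategy = gr.Radio(
        ["greedy", "beam_search", "sampling_top_k"],
        value="greedy",
        label="Decoding strategy",
    )
    # Hidden by default; only meaningful for beam search.
    num_beams = gr.Slider(
        minimum=1, maximum=20, value=3, step=1, visible=False, label="Number of beams"
    )
    # Toggle the slider's visibility every time the radio selection changes.
    decoding_strategy.change(
        fn=lambda selection: gr.Slider.update(visible=(selection == "beam_search")),
        inputs=decoding_strategy,
        outputs=num_beams,
    )

demo.launch()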
elon_musk.md CHANGED
@@ -20,4 +20,4 @@ Bot: Yes, it is possible. If we continue to pollute the environment and destroy
User: I think this is a lost cause. <fake_token_around_image> <image> <fake_token_around_image> Like Elon Musk, I think we should instead all move to another planet now.
Bot: El ón Mus ́k is the CEO of SpaceX and Tesla Motors. Space X is developing a reusable rocket system that could be used to transport people and cargo to Mars. Mus k believes that colonizing Mars is essential for the survival of humanity, because it would provide a backup plan in case something goes wrong on earth. However, many people are skeptical about the feasibility of such a project. Some argue that it will take too long and cost too much money to establish a colony on Mars, while others say that the risks are too great. Nevertheless, Mus K remains optimistic about his vision for Mars colonization. "I think there is good reason to believe that we can build a self-sustaining city on mars," he said in an interview with Time magazine. “It would be quite an adventure.”
----
- The project is still in development. I'm currently working on improving the accuracy of image recognition and adding more features to it.</s>
+ The project is still in development. I'm currently working on improving the accuracy of image recognition and adding more features to it.</s>
hello.py ADDED
@@ -0,0 +1,73 @@
+ [
+ """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.\n\nThe conversation begins:\nUser:"""
+ """https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg""",
+ """Describe this image.</s>\n""",
+ """Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>\n""",
+ """User:How about this image?""",
+ "https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg""",
+ """Can you describe it too?</s>\n"""
+ """Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.</s>\n""",
+ """User: What kind of breed is it?</s>\n""",
+ """Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.</s>\n""",
+ """User: What can you tell me about this breed of dogs?</s>\n"""
+ """Assistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.</s>\n""",
+ """User: ghjkhjabnufs</s>\n""",
+ """Assistant: That doesn’t seem to be a word. Could you ask me another way?</s>\n""",
+ """User: Do you like Cavalier King Charles Spaniel?</s>\n""",
+ """Assistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.</s>\n""",
+ """User: How many dogs do you see in this image?""",
+ """https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg""",
+ """</s>\n""",
+ """Assistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.</s>\n""",
+ """User: What color are the cat's eyes?</s>\n""",
+ """Assistant:"""
+ ]
+
+ SYSTEM_PROMPT = """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
+
+ The conversation begins:
+ User:<fake_token_around_image><image:https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg><fake_token_around_image>Describe this image.</s>
+ Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>"""
+
+ prompts = [
+ """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.\n\nThe conversation begins:\nUser:""",
+ """https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg""",
+ """Describe this image.</s>\nAssistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>"""
+ ]
+
+ SYSTEM_PROMPT = """HELLO
+ AUREVOIR
+ User:<fake_token_around_image><image:AAA><fake_token_around_image><image:BBB><fake_token_around_image>BOJOUR</s>
+ Assistant: SPECIFIC</s>"""
+
+ def split_prompt_into_list(prompt_str):
+ prompt_splitted = prompt_str.split("<fake_token_around_image>")
+ prompt_list = []
+ for ps in prompt_splitted:
+ if ps.startswith("<image:"):
+ prompt_list.append(ps[7:-1])
+ else:
+ prompt_list.append(ps)
+ return prompt_list
+
+ prompts = split_prompt_into_list(formated_prompt)
+ i = processor(prompts, eval_mode=True)
+
+ from transformers import AutoTokenizer, AutoProcessor
+ checkpoint = "HuggingFaceM4/idefics-9b-instruct"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ processor = AutoProcessor.from_pretrained(checkpoint)
+
+ prompts = [
+ [
+ "User:",
+ # "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg",
+ # "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg",
+ "Describe this image.\nAssistant: An image of two kittens in grass.\n",
+ "User:",
+ # "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
+ "Describe this image.\nAssistant:",
+ ],
+ ]
+ inputs = processor(prompts, eval_mode=True)
+ tokenizer.decode(inputs["input_ids"][0])
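hello.py reads as a scratch file, but its split_prompt_into_list helper shows how a flat prompt string is turned into the list form the processor expects: the string is split on <fake_token_around_image>, and <image:URL> chunks are reduced to the bare URL while text chunks are kept as-is. A small usage check follows; the function body is copied from the file, while the prompt string and URL are made up for illustration.

def split_prompt_into_list(prompt_str):
    # Split on the sentinel token, then strip the "<image:...>" wrapper from image chunks.
    prompt_splitted = prompt_str.split("<fake_token_around_image>")
    prompt_list = []
    for ps in prompt_splitted:
        if ps.startswith("<image:"):
            prompt_list.append(ps[7:-1])
        else:
            prompt_list.append(ps)
    return prompt_list

formatted_prompt = (
    "User:<fake_token_around_image><image:https://example.com/cat.jpg>"
    "<fake_token_around_image>Describe this image.</s>"
)
print(split_prompt_into_list(formatted_prompt))
# ['User:', 'https://example.com/cat.jpg', 'Describe this image.</s>']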
requirements.txt CHANGED
@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cu113
torch
- transformers @ git+https://github.com/huggingface/transformers@0e35f259e454fee5c9a7925a525507164dd0685f
+ transformers @ git+https://github.com/huggingface/transformers.git@786092a35e18154cacad62c30fe92bac2c27a1e1
requests
pillow
torchvision