VictorSanh commited on
Commit
0e509eb
โ€ข
1 Parent(s): e26a679

Update visualization

Browse files
Files changed (1) hide show
  1. app_dialogue.py +63 -17
app_dialogue.py CHANGED
@@ -15,9 +15,9 @@ from text_generation import Client
15
  from transformers import AutoProcessor
16
 
17
 
18
- MODELS = [
19
  "HuggingFaceM4/idefics-9b-instruct",
20
- "HuggingFaceM4/idefics-80b-instruct",
21
  ]
22
 
23
  API_PATHS = {
@@ -66,7 +66,7 @@ API_TOKEN = os.getenv("HF_AUTH_TOKEN")
66
  IDEFICS_LOGO = "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/IDEFICS_logo.png"
67
 
68
  PROCESSOR = AutoProcessor.from_pretrained(
69
- "HuggingFaceM4/idefics-80b-instruct",
70
  token=API_TOKEN,
71
  )
72
 
@@ -314,7 +314,6 @@ textbox = gr.Textbox(
314
  visible=True,
315
  container=False,
316
  label="Text input",
317
- scale = 6
318
  )
319
  with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
320
  gr.HTML("""<h1 align="center">๐Ÿถ IDEFICS Playground - EMBARGO UNTIL AUGUST 22ND</h1>""") # TODO remove embargo
@@ -326,7 +325,7 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
326
  **EMBARGO UNTIL AUGUST 22ND** This demo showcases **IDEFICS**, an open-access large visual language model. Like GPT-4, the multimodal model accepts arbitrary sequences of image and text inputs and produces text outputs. IDEFICS can answer questions about images, describe visual content, create stories grounded in multiple images, etc.
327
  <br>IDEFICS (which stands for **I**mage-aware **D**ecoder **E**nhanced ร  la **F**lamingo with **I**nterleaved **C**ross-attention**S**) is an open-access reproduction of [Flamingo](https://huggingface.co/papers/2204.14198), a closed-source visual language model developed by DeepMind. IDEFICS was built solely on publicly available data and models. It is currently the only visual language model of this scale available in open-access.
328
 
329
- ๐Ÿ“š The variants available in this demo were fine-tuned on a mixture of supervised and instruction fine-tuning to make the models more suitable in conversational settings. For more details, we refer to our [blog post](https://huggingface.co/blog/introducing-idefics).
330
 
331
  ๐Ÿ…ฟ๏ธ **Intended uses:** This demo along with the [supporting models](https://huggingface.co/models?sort=trending&search=HuggingFaceM4%2Fidefics) are provided as research artefacts to the community. We detail misuses and out-of-scope uses [here](https://huggingface.co/HuggingFaceM4/idefics-80b#misuse-and-out-of-scope-use).
332
 
@@ -384,10 +383,15 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
384
 
385
  with gr.Group():
386
  with gr.Row():
 
387
  textbox.render()
 
388
  submit_btn = gr.Button(value="โ–ถ๏ธ Submit", visible=True)
 
389
  clear_btn = gr.ClearButton([textbox, imagebox, chatbot], value="๐Ÿงน Clear")
 
390
  regenerate_btn = gr.Button(value="๐Ÿ”„ Regenerate", visible=True)
 
391
  upload_btn = gr.UploadButton("๐Ÿ“ Upload image", file_types=["image"])
392
  # with gr.Group():
393
  # with gr.Row():
@@ -548,18 +552,60 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
548
  acc_text = ""
549
 
550
  def process_example(message, image):
551
- clear_msg, image_value, chat = model_inference(
552
- model_selector="HuggingFaceM4/idefics-80b-instruct",
553
- user_prompt_str=message,
554
- chat_history=[],
555
- image=image,
556
- decoding_strategy="Greedy",
557
- temperature=None,
558
- max_new_tokens=512,
559
- repetition_penalty=None,
560
- top_p=0.95,
 
 
 
 
 
 
 
 
 
 
561
  )
562
- return clear_msg, image_value, chat
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
  textbox.submit(
565
  fn=model_inference,
@@ -789,7 +835,7 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
789
  inputs=[textbox, imagebox],
790
  outputs=[textbox, imagebox, chatbot],
791
  fn=process_example,
792
- cache_examples=False,
793
  examples_per_page=6,
794
  label=(
795
  "Click on any example below to get started.\nFor convenience, the model generations have been"
 
15
  from transformers import AutoProcessor
16
 
17
 
18
# Model endpoints selectable in the demo's model dropdown.
# NOTE(review): the 80b-instruct entry is temporarily disabled — the TODO
# marker indicates it should be uncommented before release.
MODELS = [  # TODO uncomment
    "HuggingFaceM4/idefics-9b-instruct",
    # "HuggingFaceM4/idefics-80b-instruct",
]
22
 
23
  API_PATHS = {
 
66
  IDEFICS_LOGO = "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/IDEFICS_logo.png"
67
 
68
# Processor (tokenizer + image preprocessor) loaded once at module import.
# Uses the 9b-instruct checkpoint, matching the only entry currently enabled
# in MODELS; authenticated with the space's HF_AUTH_TOKEN.
PROCESSOR = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics-9b-instruct",
    token=API_TOKEN,
)
72
 
 
314
  visible=True,
315
  container=False,
316
  label="Text input",
 
317
  )
318
  with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
319
  gr.HTML("""<h1 align="center">๐Ÿถ IDEFICS Playground - EMBARGO UNTIL AUGUST 22ND</h1>""") # TODO remove embargo
 
325
  **EMBARGO UNTIL AUGUST 22ND** This demo showcases **IDEFICS**, an open-access large visual language model. Like GPT-4, the multimodal model accepts arbitrary sequences of image and text inputs and produces text outputs. IDEFICS can answer questions about images, describe visual content, create stories grounded in multiple images, etc.
326
  <br>IDEFICS (which stands for **I**mage-aware **D**ecoder **E**nhanced ร  la **F**lamingo with **I**nterleaved **C**ross-attention**S**) is an open-access reproduction of [Flamingo](https://huggingface.co/papers/2204.14198), a closed-source visual language model developed by DeepMind. IDEFICS was built solely on publicly available data and models. It is currently the only visual language model of this scale available in open-access.
327
 
328
+ ๐Ÿ“š The variants available in this demo were fine-tuned on a mixture of supervised and instruction fine-tuning to make the models more suitable in conversational settings. For more details, we refer to our [blog post](TODO).
329
 
330
  ๐Ÿ…ฟ๏ธ **Intended uses:** This demo along with the [supporting models](https://huggingface.co/models?sort=trending&search=HuggingFaceM4%2Fidefics) are provided as research artefacts to the community. We detail misuses and out-of-scope uses [here](https://huggingface.co/HuggingFaceM4/idefics-80b#misuse-and-out-of-scope-use).
331
 
 
383
 
384
  with gr.Group():
385
  with gr.Row():
386
+ with gr.Column(scale=0.6):
387
  textbox.render()
388
+ with gr.Column(scale=0.1, min_width=80):
389
  submit_btn = gr.Button(value="โ–ถ๏ธ Submit", visible=True)
390
+ with gr.Column(scale=0.1, min_width=0):
391
  clear_btn = gr.ClearButton([textbox, imagebox, chatbot], value="๐Ÿงน Clear")
392
+ with gr.Column(scale=0.1, min_width=0):
393
  regenerate_btn = gr.Button(value="๐Ÿ”„ Regenerate", visible=True)
394
+ with gr.Column(scale=0.1, min_width=0):
395
  upload_btn = gr.UploadButton("๐Ÿ“ Upload image", file_types=["image"])
396
  # with gr.Group():
397
  # with gr.Row():
 
552
  acc_text = ""
553
 
554
def process_example(message, image):
    """
    Pre-compute a default example with the 80b-instruct model.

    Same as `model_inference`, but hard-wired to greedy decoding and to the
    80b-instruct endpoint so that the gallery examples are deterministic and
    therefore cacheable (`cache_examples=True`).

    Args:
        message: user prompt string for the example.
        image: optional image input; `None` when there is no image or the
            image is embedded inline in the prompt as
            `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`.

    Returns:
        A `(textbox_value, imagebox_value, chat_history)` triple: the textbox
        is cleared (`""`), the image box is reset (`None`), and the chat
        history ends with the rendered user turn plus the generated reply.
    """
    model_selector = "HuggingFaceM4/idefics-80b-instruct"
    user_prompt_str = message
    chat_history = []
    max_new_tokens = 512

    formated_prompt_list, user_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
        current_user_prompt_str=user_prompt_str.strip(),
        current_image=image,
        history=chat_history,
    )

    client_endpoint = API_PATHS[model_selector]
    client = Client(
        base_url=client_endpoint,
        headers={"x-use-cache": "0", "Authorization": f"Bearer {API_TOKEN}"},
    )

    # Common parameters to all decoding strategies.
    # This documentation is useful to read:
    # https://huggingface.co/docs/transformers/main/en/generation_strategies
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": None,
        "stop_sequences": EOS_STRINGS,
        "do_sample": False,  # greedy decoding — examples must be reproducible
    }

    if image is None:
        # Case where there is no image OR the image is passed inline as
        # `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`.
        chat_history.append([prompt_list_to_markdown(user_prompt_list), ASSISTANT_PREPEND])
    else:
        # Case where the image is passed through the Image Box: render the
        # image inside the same chat bubble as the text.
        chat_history.append(
            [
                f"{prompt_list_to_markdown([image] + user_prompt_list)}",
                ASSISTANT_PREPEND,
            ]
        )

    query = prompt_list_to_tgi_input(formated_prompt_list)
    generated_text = client.generate(prompt=query, **generation_args)
    if generated_text.endswith("\nUser"):
        # Strip a trailing "\nUser" turn marker leaked by the model.
        # BUG FIX: the original sliced the undefined name `generate_text`,
        # which raised NameError whenever this branch was taken.
        generated_text = generated_text[:-5]

    last_turn = chat_history.pop(-1)
    last_turn[-1] += generated_text
    chat_history.append(last_turn)
    return "", None, chat_history
609
 
610
  textbox.submit(
611
  fn=model_inference,
 
835
  inputs=[textbox, imagebox],
836
  outputs=[textbox, imagebox, chatbot],
837
  fn=process_example,
838
+ cache_examples=True,
839
  examples_per_page=6,
840
  label=(
841
  "Click on any example below to get started.\nFor convenience, the model generations have been"