merve (HF staff) committed
Commit 411cee3
1 Parent(s): add43e5

Update app.py

Files changed (1)
  1. app.py +136 -231
app.py CHANGED
@@ -1,9 +1,7 @@
- import copy
  import gradio as gr
- from transformers import AutoProcessor, Idefics2ForConditionalGeneration, TextIteratorStreamer
- from threading import Thread
+ from transformers import AutoProcessor, Idefics2ForConditionalGeneration
  import re
- import time
+ import time
  from PIL import Image
  import torch
  import spaces
@@ -11,7 +9,7 @@ import subprocess
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)


- PROCESSOR = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
+ processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")

  model = Idefics2ForConditionalGeneration.from_pretrained(
  "HuggingFaceM4/idefics2-8b",
@@ -19,117 +17,35 @@ model = Idefics2ForConditionalGeneration.from_pretrained(
  _attn_implementation="flash_attention_2",
  trust_remote_code=True).to("cuda")

-
-
- def turn_is_pure_media(turn):
- return turn[1] is None
- def format_user_prompt_with_im_history_and_system_conditioning(
- user_prompt, chat_history
+ @spaces.GPU
+ def model_inference(
+ image, text, decoding_strategy, temperature,
+ max_new_tokens, repetition_penalty, top_p
  ):
- """
- Produces the resulting list that needs to go inside the processor.
- It handles the potential image(s), the history and the system conditionning.
- """
- resulting_messages = copy.deepcopy([])
- resulting_images = []
-
- # Format history
- for turn in chat_history:
- if not resulting_messages or (resulting_messages and resulting_messages[-1]["role"] != "user"):
- resulting_messages.append(
- {
- "role": "user",
- "content": [],
- }
- )
+ if text == "" and not image:
+ gr.Error("Please input a query and optionally image(s).")

- if turn_is_pure_media(turn):
- media = turn[0][0]
- resulting_messages[-1]["content"].append({"type": "image"})
- resulting_images.append(Image.open(media))
- else:
- user_utterance, assistant_utterance = turn
- resulting_messages[-1]["content"].append(
- {"type": "text", "text": user_utterance.strip()}
- )
- resulting_messages.append(
- {
- "role": "assistant",
- "content": [
- {"type": "text", "text": user_utterance.strip()}
- ]
- }
- )
+ if text == "" and image:
+ gr.Error("Please input a text query along the image(s).")

- # Format current input
- if not user_prompt["files"]:
- resulting_messages.append(
+ resulting_messages = [
  {
  "role": "user",
- "content": [
- {"type": "text", "text": user_prompt['text']}
- ],
- }
- )
- else:
- # Choosing to put the image first (i.e. before the text), but this is an arbiratrary choice.
- resulting_messages.append(
- {
- "role": "user",
- "content": [{"type": "image"}] * len(user_prompt['files']) + [
- {"type": "text", "text": user_prompt['text']}
+ "content": [{"type": "image"}] + [
+ {"type": "text", "text": text}
  ]
  }
- )
- for im in user_prompt["files"]:
- print(im)
- if isinstance(im, str):
-
- resulting_images.extend([Image.open(im)])
- elif isinstance(im, dict):
- resulting_images.extend([Image.open(im['path'])])
-
-
- return resulting_messages, resulting_images
-
-
- def extract_images_from_msg_list(msg_list):
- all_images = []
- for msg in msg_list:
- for c_ in msg["content"]:
- if isinstance(c_, Image.Image):
- all_images.append(c_)
- return all_images
-
- @spaces.GPU(duration=180)
- def model_inference(
- user_prompt,
- chat_history,
- decoding_strategy,
- temperature,
- max_new_tokens,
- repetition_penalty,
- top_p,
- ):
- if user_prompt["text"].strip() == "" and not user_prompt["files"]:
- gr.Error("Please input a query and optionally image(s).")
-
- if user_prompt["text"].strip() == "" and user_prompt["files"]:
- gr.Error("Please input a text query along the image(s).")
-
-
- streamer = TextIteratorStreamer(
- PROCESSOR.tokenizer,
- skip_prompt=True,
- timeout=5.,
- )
-
- # Common parameters to all decoding strategies
- # This documentation is useful to read: https://huggingface.co/docs/transformers/main/en/generation_strategies
+ ]
+
+
+ prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
+ inputs = processor(text=prompt, images=[image], return_tensors="pt")
+ inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
  generation_args = {
  "max_new_tokens": max_new_tokens,
  "repetition_penalty": repetition_penalty,
- "streamer": streamer,
+
  }

  assert decoding_strategy in [
@@ -143,133 +59,122 @@ def model_inference(
  generation_args["do_sample"] = True
  generation_args["top_p"] = top_p

- # Creating model inputs
- resulting_text, resulting_images = format_user_prompt_with_im_history_and_system_conditioning(
- user_prompt=user_prompt,
- chat_history=chat_history,
- )
- prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
- inputs = PROCESSOR(text=prompt, images=resulting_images if resulting_images else None, return_tensors="pt")
- inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
  generation_args.update(inputs)

-
- thread = Thread(
- target=model.generate,
- kwargs=generation_args,
- )
- thread.start()
-
- print("Start generating")
- acc_text = ""
- for text_token in streamer:
- time.sleep(0.04)
- acc_text += text_token
- if acc_text.endswith("<end_of_utterance>"):
- acc_text = acc_text[:-18]
- yield acc_text
- print("Success - generated the following text:", acc_text)
- print("-----")
- BOT_AVATAR = "IDEFICS_logo.png"
-
- # Hyper-parameters for generation
- max_new_tokens = gr.Slider(
- minimum=8,
- maximum=1024,
- value=512,
- step=1,
- interactive=True,
- label="Maximum number of new tokens to generate",
- )
- repetition_penalty = gr.Slider(
- minimum=0.01,
- maximum=5.0,
- value=1.2,
- step=0.01,
- interactive=True,
- label="Repetition penalty",
- info="1.0 is equivalent to no penalty",
- )
- decoding_strategy = gr.Radio(
- [
- "Greedy",
- "Top P Sampling",
- ],
- value="Greedy",
- label="Decoding strategy",
- interactive=True,
- info="Higher values is equivalent to sampling more low-probability tokens.",
- )
- temperature = gr.Slider(
- minimum=0.0,
- maximum=5.0,
- value=0.4,
- step=0.1,
- interactive=True,
- label="Sampling temperature",
- info="Higher values will produce more diverse outputs.",
- )
- top_p = gr.Slider(
- minimum=0.01,
- maximum=0.99,
- value=0.8,
- step=0.01,
- interactive=True,
- label="Top P",
- info="Higher values is equivalent to sampling more low-probability tokens.",
- )
-
-
- chatbot = gr.Chatbot(
- label="Idefics2",
- avatar_images=[None, BOT_AVATAR],
- # height=750,
- )
-
-
- with gr.Blocks(fill_height=True, css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img { width: auto; max-width: 30%; height: auto; max-height: 30%; }") as demo:
- decoding_strategy.change(
- fn=lambda selection: gr.Slider(
- visible=(
- selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
- )
- ),
- inputs=decoding_strategy,
- outputs=temperature,
- )
- decoding_strategy.change(
- fn=lambda selection: gr.Slider(
- visible=(
- selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
- )
- ),
- inputs=decoding_strategy,
- outputs=repetition_penalty,
- )
- decoding_strategy.change(
- fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
- inputs=decoding_strategy,
- outputs=top_p,
- )
-
- description = "Try [IDEFICS2-8B](https://huggingface.co/HuggingFaceM4/idefics2-8b), the instruction fine-tuned IDEFICS2 in this demo. 💬 IDEFICS2 is a state-of-the-art vision language model in various benchmarks. To get started, upload an image and write a text prompt or try one of the examples. You can also play with advanced generation parameters. To learn more about IDEFICS2, read [the blog](https://huggingface.co/blog/idefics2). Note that this model is not as chatty as the upcoming chatty model, and it will give shorter answers."
-
-
- gr.ChatInterface(
- fn=model_inference,
- chatbot=chatbot,
- examples=[[{"text": "How many items are sold?", "files":["./example_images/docvqa_example.png"]}],
- [{"text": "What is this UI about?", "files":["./example_images/s2w_example.png"]}],
- [{"text": "I want to go somewhere similar to the one in the photo. Give me destinations and travel tips.", "files":["./example_images/example_images_travel_tips.jpg"]}],
- [{"text": "Can you tell me a very short story based on this image?", "files":["./example_images/chicken_on_money.png"]}],
- [{"text": "Where is this pastry from?", "files":["./example_images/baklava.png"]}],
- [{"text": "How much percent is the order status?", "files":["./example_images/dummy_pdf.png"]}],
- [{"text":"As an art critic AI assistant, could you describe this painting in details and make a thorough critic?.", "files":["./example_images/art_critic.png"]}]
+ # Generate
+ generated_ids = model.generate(**generation_args)
+
+ generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+ print(generated_texts)
+ pattern = r"Assistant: (.*)"
+
+ # Use regular expression to find the desired part
+ result = re.search(pattern, generated_texts[0]).group(1)
+
+ return result[:-1]
+
+
+ with gr.Blocks(fill_height=True) as demo:
+ gr.Markdown("## IDEFICS2 Instruction 🐶")
+ gr.Markdown("Play with fine-tuned [IDEFICS2](https://huggingface.co/HuggingFaceM4/idefics2-8b) in this demo. To get started, upload an image and text or try one of the examples.")
+ gr.Markdown("**Important note**: This model is not made for chatting, the chatty IDEFICS2 will be released in the upcoming days. **This model is very strong on various tasks, including visual question answering, document retrieval and more.**")
+ gr.Markdown("Learn more about IDEFICS2 in this [blog post](https://huggingface.co/blog/idefics2).")
+
+ with gr.Row():
+ with gr.Column():
+ image_input = gr.Image(label="Upload your Image", type="pil")
+ query_input = gr.Textbox(label="Prompt")
+ submit_btn = gr.Button("Submit")
+
+ with gr.Column():
+
+ output = gr.Textbox(label="Output")
+
+ with gr.Accordion():
+ # Hyper-parameters for generation
+ max_new_tokens = gr.Slider(
+ minimum=8,
+ maximum=1024,
+ value=512,
+ step=1,
+ interactive=True,
+ label="Maximum number of new tokens to generate",
+ )
+ repetition_penalty = gr.Slider(
+ minimum=0.01,
+ maximum=5.0,
+ value=1.2,
+ step=0.01,
+ interactive=True,
+ label="Repetition penalty",
+ info="1.0 is equivalent to no penalty",
+ )
+ temperature = gr.Slider(
+ minimum=0.0,
+ maximum=5.0,
+ value=0.4,
+ step=0.1,
+ interactive=True,
+ label="Sampling temperature",
+ info="Higher values will produce more diverse outputs.",
+ )
+ top_p = gr.Slider(
+ minimum=0.01,
+ maximum=0.99,
+ value=0.8,
+ step=0.01,
+ interactive=True,
+ label="Top P",
+ info="Higher values is equivalent to sampling more low-probability tokens.",
+ )
+ decoding_strategy = gr.Radio(
+ [
+ "Greedy",
+ "Top P Sampling",
+ ],
+ value="Greedy",
+ label="Decoding strategy",
+ interactive=True,
+ info="Higher values is equivalent to sampling more low-probability tokens.",
+ )
+ decoding_strategy.change(
+ fn=lambda selection: gr.Slider(
+ visible=(
+ selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
+ )
+ ),
+ inputs=decoding_strategy,
+ outputs=temperature,
+ )
+
+ decoding_strategy.change(
+ fn=lambda selection: gr.Slider(
+ visible=(
+ selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
+ )
+ ),
+ inputs=decoding_strategy,
+ outputs=repetition_penalty,
+ )
+ decoding_strategy.change(
+ fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
+ inputs=decoding_strategy,
+ outputs=top_p,
+ )
+ examples=[["./example_images/docvqa_example.png", "How many items are sold?", "Greedy", 0.4, 512, 1.2, 0.8],
+ ["./example_images/s2w_example.png", "What is this UI about?", "Greedy", 0.4, 512, 1.2, 0.8],
+ ["./example_images/example_images_travel_tips.jpg", "I want to go somewhere similar to the one in the photo. Give me destinations and travel tips.", 0.4, 512, 1.2, 0.8],
+ ["./example_images/chicken_on_money.png", "Can you tell me a very short story based on this image?", 0.4, 512, 1.2, 0.8],
+ ["./example_images/baklava.png", "Where is this pastry from?", 0.4, 512, 1.2, 0.8],
+ ["./example_images/dummy_pdf.png", "How much percent is the order status?", 0.4, 512, 1.2, 0.8],
+ ["./example_images/art_critic.png", "As an art critic AI assistant, could you describe this painting in details and make a thorough critic?.",
+ 0.4, 512, 1.2, 0.8]]
  ],
- description=description,
- title="Idefics2 Playground 🐶 ",
- multimodal=True,
- additional_inputs=[decoding_strategy, temperature, max_new_tokens, repetition_penalty, top_p],
- )
+
+ submit_btn.click(model_inference, inputs = [image_input, query_input, decoding_strategy, temperature,
+ max_new_tokens, repetition_penalty, top_p],
+ outputs=output)
+

  demo.launch(debug=True)
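
For reference, the single-turn inference path this commit switches to can be exercised outside Gradio with the short sketch below. It only rearranges calls that appear in the new app.py (chat-template formatting, the processor call, generate, batch_decode, and the "Assistant:" regex); the example image path, the bfloat16 dtype, and the CUDA device handling are illustrative assumptions, not part of the commit.

# Minimal sketch of the new single-turn IDEFICS2 flow.
# Assumptions: example image path from the demo's examples list, bfloat16 weights, a CUDA device.
import re
import torch
from PIL import Image
from transformers import AutoProcessor, Idefics2ForConditionalGeneration

processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
model = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b", torch_dtype=torch.bfloat16
).to("cuda")

# One user turn: an image placeholder followed by the text prompt, mirroring model_inference().
image = Image.open("./example_images/docvqa_example.png")  # placeholder path
messages = [{"role": "user",
             "content": [{"type": "image"}, {"type": "text", "text": "How many items are sold?"}]}]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

generated_ids = model.generate(**inputs, max_new_tokens=512, repetition_penalty=1.2)
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# The app keeps only the assistant turn from the decoded string.
answer = re.search(r"Assistant: (.*)", decoded).group(1)
print(answer)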