VictorSanh commited on
Commit
0e509eb
โ€ข
1 Parent(s): e26a679

Update visualization

Browse files
Files changed (1) hide show
  1. app_dialogue.py +63 -17
app_dialogue.py CHANGED
@@ -15,9 +15,9 @@ from text_generation import Client
15
  from transformers import AutoProcessor
16
 
17
 
18
- MODELS = [
19
  "HuggingFaceM4/idefics-9b-instruct",
20
- "HuggingFaceM4/idefics-80b-instruct",
21
  ]
22
 
23
  API_PATHS = {
@@ -66,7 +66,7 @@ API_TOKEN = os.getenv("HF_AUTH_TOKEN")
66
  IDEFICS_LOGO = "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/IDEFICS_logo.png"
67
 
68
  PROCESSOR = AutoProcessor.from_pretrained(
69
- "HuggingFaceM4/idefics-80b-instruct",
70
  token=API_TOKEN,
71
  )
72
 
@@ -314,7 +314,6 @@ textbox = gr.Textbox(
314
  visible=True,
315
  container=False,
316
  label="Text input",
317
- scale = 6
318
  )
319
  with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
320
  gr.HTML("""<h1 align="center">๐Ÿถ IDEFICS Playground - EMBARGO UNTIL AUGUST 22ND</h1>""") # TODO remove embargo
@@ -326,7 +325,7 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
326
  **EMBARGO UNTIL AUGUST 22ND** This demo showcases **IDEFICS**, an open-access large visual language model. Like GPT-4, the multimodal model accepts arbitrary sequences of image and text inputs and produces text outputs. IDEFICS can answer questions about images, describe visual content, create stories grounded in multiple images, etc.
327
  <br>IDEFICS (which stands for **I**mage-aware **D**ecoder **E**nhanced ร  la **F**lamingo with **I**nterleaved **C**ross-attention**S**) is an open-access reproduction of [Flamingo](https://huggingface.co/papers/2204.14198), a closed-source visual language model developed by DeepMind. IDEFICS was built solely on publicly available data and models. It is currently the only visual language model of this scale available in open-access.
328
 
329
- ๐Ÿ“š The variants available in this demo were fine-tuned on a mixture of supervised and instruction fine-tuning to make the models more suitable in conversational settings. For more details, we refer to our [blog post](https://huggingface.co/blog/introducing-idefics).
330
 
331
  ๐Ÿ…ฟ๏ธ **Intended uses:** This demo along with the [supporting models](https://huggingface.co/models?sort=trending&search=HuggingFaceM4%2Fidefics) are provided as research artefacts to the community. We detail misuses and out-of-scope uses [here](https://huggingface.co/HuggingFaceM4/idefics-80b#misuse-and-out-of-scope-use).
332
 
@@ -384,10 +383,15 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
384
 
385
  with gr.Group():
386
  with gr.Row():
 
387
  textbox.render()
 
388
  submit_btn = gr.Button(value="โ–ถ๏ธ Submit", visible=True)
 
389
  clear_btn = gr.ClearButton([textbox, imagebox, chatbot], value="๐Ÿงน Clear")
 
390
  regenerate_btn = gr.Button(value="๐Ÿ”„ Regenerate", visible=True)
 
391
  upload_btn = gr.UploadButton("๐Ÿ“ Upload image", file_types=["image"])
392
  # with gr.Group():
393
  # with gr.Row():
@@ -548,18 +552,60 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
548
  acc_text = ""
549
 
550
  def process_example(message, image):
551
- clear_msg, image_value, chat = model_inference(
552
- model_selector="HuggingFaceM4/idefics-80b-instruct",
553
- user_prompt_str=message,
554
- chat_history=[],
555
- image=image,
556
- decoding_strategy="Greedy",
557
- temperature=None,
558
- max_new_tokens=512,
559
- repetition_penalty=None,
560
- top_p=0.95,
 
 
 
 
 
 
 
 
 
 
561
  )
562
- return clear_msg, image_value, chat
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
  textbox.submit(
565
  fn=model_inference,
@@ -789,7 +835,7 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
789
  inputs=[textbox, imagebox],
790
  outputs=[textbox, imagebox, chatbot],
791
  fn=process_example,
792
- cache_examples=False,
793
  examples_per_page=6,
794
  label=(
795
  "Click on any example below to get started.\nFor convenience, the model generations have been"
 
15
  from transformers import AutoProcessor
16
 
17
 
18
# Model endpoints selectable in the demo's model dropdown.
# NOTE(review): the 80b-instruct entry is temporarily disabled — the TODO
# marker indicates it should be uncommented before release.
MODELS = [  # TODO uncomment
    "HuggingFaceM4/idefics-9b-instruct",
    # "HuggingFaceM4/idefics-80b-instruct",
]
22
 
23
  API_PATHS = {
 
66
  IDEFICS_LOGO = "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/IDEFICS_logo.png"
67
 
68
# Processor (tokenizer + image preprocessor) loaded once at module import.
# Uses the 9b-instruct checkpoint, matching the only entry currently enabled
# in MODELS; authenticated with the space's HF_AUTH_TOKEN.
PROCESSOR = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics-9b-instruct",
    token=API_TOKEN,
)
72
 
 
314
  visible=True,
315
  container=False,
316
  label="Text input",
 
317
  )
318
  with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
319
  gr.HTML("""<h1 align="center">๐Ÿถ IDEFICS Playground - EMBARGO UNTIL AUGUST 22ND</h1>""") # TODO remove embargo
 
325
  **EMBARGO UNTIL AUGUST 22ND** This demo showcases **IDEFICS**, an open-access large visual language model. Like GPT-4, the multimodal model accepts arbitrary sequences of image and text inputs and produces text outputs. IDEFICS can answer questions about images, describe visual content, create stories grounded in multiple images, etc.
326
  <br>IDEFICS (which stands for **I**mage-aware **D**ecoder **E**nhanced ร  la **F**lamingo with **I**nterleaved **C**ross-attention**S**) is an open-access reproduction of [Flamingo](https://huggingface.co/papers/2204.14198), a closed-source visual language model developed by DeepMind. IDEFICS was built solely on publicly available data and models. It is currently the only visual language model of this scale available in open-access.
327
 
328
+ ๐Ÿ“š The variants available in this demo were fine-tuned on a mixture of supervised and instruction fine-tuning to make the models more suitable in conversational settings. For more details, we refer to our [blog post](TODO).
329
 
330
  ๐Ÿ…ฟ๏ธ **Intended uses:** This demo along with the [supporting models](https://huggingface.co/models?sort=trending&search=HuggingFaceM4%2Fidefics) are provided as research artefacts to the community. We detail misuses and out-of-scope uses [here](https://huggingface.co/HuggingFaceM4/idefics-80b#misuse-and-out-of-scope-use).
331
 
 
383
 
384
  with gr.Group():
385
  with gr.Row():
386
+ with gr.Column(scale=0.6):
387
  textbox.render()
388
+ with gr.Column(scale=0.1, min_width=80):
389
  submit_btn = gr.Button(value="โ–ถ๏ธ Submit", visible=True)
390
+ with gr.Column(scale=0.1, min_width=0):
391
  clear_btn = gr.ClearButton([textbox, imagebox, chatbot], value="๐Ÿงน Clear")
392
+ with gr.Column(scale=0.1, min_width=0):
393
  regenerate_btn = gr.Button(value="๐Ÿ”„ Regenerate", visible=True)
394
+ with gr.Column(scale=0.1, min_width=0):
395
  upload_btn = gr.UploadButton("๐Ÿ“ Upload image", file_types=["image"])
396
  # with gr.Group():
397
  # with gr.Row():
 
552
  acc_text = ""
553
 
554
def process_example(message, image):
    """
    Pre-compute a default example with the 80b-instruct model.

    Same as `model_inference`, but hard-wired to greedy decoding and to the
    80b-instruct endpoint so that the gallery examples are deterministic and
    therefore cacheable (`cache_examples=True`).

    Args:
        message: user prompt string for the example.
        image: optional image input; `None` when there is no image or the
            image is embedded inline in the prompt as
            `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`.

    Returns:
        A `(textbox_value, imagebox_value, chat_history)` triple: the textbox
        is cleared (`""`), the image box is reset (`None`), and the chat
        history ends with the rendered user turn plus the generated reply.
    """
    model_selector = "HuggingFaceM4/idefics-80b-instruct"
    user_prompt_str = message
    chat_history = []
    max_new_tokens = 512

    formated_prompt_list, user_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
        current_user_prompt_str=user_prompt_str.strip(),
        current_image=image,
        history=chat_history,
    )

    client_endpoint = API_PATHS[model_selector]
    client = Client(
        base_url=client_endpoint,
        headers={"x-use-cache": "0", "Authorization": f"Bearer {API_TOKEN}"},
    )

    # Common parameters to all decoding strategies.
    # This documentation is useful to read:
    # https://huggingface.co/docs/transformers/main/en/generation_strategies
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": None,
        "stop_sequences": EOS_STRINGS,
        "do_sample": False,  # greedy decoding — examples must be reproducible
    }

    if image is None:
        # Case where there is no image OR the image is passed inline as
        # `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`.
        chat_history.append([prompt_list_to_markdown(user_prompt_list), ASSISTANT_PREPEND])
    else:
        # Case where the image is passed through the Image Box: render the
        # image inside the same chat bubble as the text.
        chat_history.append(
            [
                f"{prompt_list_to_markdown([image] + user_prompt_list)}",
                ASSISTANT_PREPEND,
            ]
        )

    query = prompt_list_to_tgi_input(formated_prompt_list)
    generated_text = client.generate(prompt=query, **generation_args)
    if generated_text.endswith("\nUser"):
        # Strip a trailing "\nUser" turn marker leaked by the model.
        # BUG FIX: the original sliced the undefined name `generate_text`,
        # which raised NameError whenever this branch was taken.
        generated_text = generated_text[:-5]

    last_turn = chat_history.pop(-1)
    last_turn[-1] += generated_text
    chat_history.append(last_turn)
    return "", None, chat_history
609
 
610
  textbox.submit(
611
  fn=model_inference,
 
835
  inputs=[textbox, imagebox],
836
  outputs=[textbox, imagebox, chatbot],
837
  fn=process_example,
838
+ cache_examples=True,
839
  examples_per_page=6,
840
  label=(
841
  "Click on any example below to get started.\nFor convenience, the model generations have been"