Add detailed captioning, increase `max_new_tokens`, and fix escape character

#8
by merve - opened
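The previous caption prompt ended in `\\n`, which Python reads as a literal backslash followed by the letter `n` rather than a newline, so the model saw two stray characters at the end of every captioning prompt. This PR fixes that escape, raises `max_new_tokens` from 40 to 50 so longer captions are less likely to be cut off, and adds a detailed-captioning mode toggled by a checkbox. A minimal sketch of the escape-character difference (plain Python, independent of this Space's code):

```python
old_prompt = "Generate a coco-style caption.\\n"  # ends in '\' + 'n': two characters
new_prompt = "Generate a coco-style caption.\n"   # ends in a single newline character

print(list(old_prompt)[-2:])  # ['\\', 'n']
print(list(new_prompt)[-1:])  # ['\n']
print(len(old_prompt) - len(new_prompt))  # 1
```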
Files changed (1)
  1. app.py +16 -8
app.py CHANGED
```diff
@@ -9,11 +9,13 @@ model_id = "adept/fuyu-8b"
 dtype = torch.bfloat16
 device = "cuda"
 
+
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = FuyuForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=dtype)
 processor = FuyuProcessor(image_processor=FuyuImageProcessor(), tokenizer=tokenizer)
 
-caption_prompt = "Generate a coco-style caption.\\n"
+CAPTION_PROMPT = "Generate a coco-style caption.\n"
+DETAILED_CAPTION_PROMPT = "What is happening in this image?"
 
 def resize_to_max(image, max_width=1080, max_height=1080):
     width, height = image.size
@@ -33,12 +35,16 @@ def predict(image, prompt):
     model_inputs = processor(text=prompt, images=[image])
     model_inputs = {k: v.to(dtype=dtype if torch.is_floating_point(v) else v.dtype, device=device) for k,v in model_inputs.items()}
 
-    generation_output = model.generate(**model_inputs, max_new_tokens=40)
+    generation_output = model.generate(**model_inputs, max_new_tokens=50)
     prompt_len = model_inputs["input_ids"].shape[-1]
     return tokenizer.decode(generation_output[0][prompt_len:], skip_special_tokens=True)
 
-def caption(image):
-    return predict(image, caption_prompt)
+def caption(image, detailed_captioning):
+    if detailed_captioning:
+        caption_prompt = DETAILED_CAPTION_PROMPT
+    else:
+        caption_prompt = CAPTION_PROMPT
+    return predict(image, caption_prompt).lstrip()
 
 def set_example_image(example: list) -> dict:
     return gr.Image.update(value=example[0])
@@ -88,20 +94,22 @@ with gr.Blocks(css=css) as demo:
 
     with gr.Tab("Image Captioning"):
         with gr.Row():
-            captioning_input = gr.Image(label="Upload your Image", type="pil")
+            with gr.Column():
+                captioning_input = gr.Image(label="Upload your Image", type="pil")
+                detailed_captioning_checkbox = gr.Checkbox(label="Enable detailed captioning")
             captioning_output = gr.Textbox(label="Output")
         captioning_btn = gr.Button("Generate Caption")
 
         gr.Examples(
-            [["assets/captioning_example_1.png"], ["assets/captioning_example_2.png"]],
-            inputs = [captioning_input],
+            [["assets/captioning_example_1.png", False], ["assets/captioning_example_2.png", True]],
+            inputs = [captioning_input, detailed_captioning_checkbox],
             outputs = [captioning_output],
             fn=caption,
             cache_examples=True,
             label='Click on any Examples below to get captioning results quickly πŸ‘‡'
         )
 
-    captioning_btn.click(fn=caption, inputs=captioning_input, outputs=captioning_output)
+    captioning_btn.click(fn=caption, inputs=[captioning_input, detailed_captioning_checkbox], outputs=captioning_output)
     vqa_btn.click(fn=predict, inputs=[image_input, text_input], outputs=vqa_output)
 
 
```
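For quick local testing, the new `caption` entry point can be exercised outside the Gradio UI, roughly like this (a sketch; it assumes the Space's model, processor, and functions above are already loaded, and uses one of the example images bundled with the Space):

```python
from PIL import Image

# One of the example images already shipped in the Space's assets folder.
image = Image.open("assets/captioning_example_1.png").convert("RGB")

# Short COCO-style caption vs. the new detailed-captioning mode.
print(caption(image, detailed_captioning=False))
print(caption(image, detailed_captioning=True))
```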