Spaces:

adept
/

fuyu-8b-demo

Runtime error

pcuenq HF Staff committed on Oct 19, 2023

Commit

f160eaf

1 Parent(s): 5cc174c

Scaling fix + final weights (#1)

- Workaround for scaling bug in transformers (d9a4d76f13ecd995b9b83e2ca93f890aa3878881)
- Use main branches (5e2122e233d1da68e93f0f3b2023c70e8b9521e4)

Files changed (2) hide show

app.py +18 -6
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,23 +1,35 @@
 import gradio as gr
-import os
 import torch
 from transformers import FuyuForCausalLM, AutoTokenizer
 from transformers.models.fuyu.processing_fuyu import FuyuProcessor
 from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
 model_id = "adept/fuyu-8b"
-revision = "refs/pr/3"
 dtype = torch.bfloat16
 device = "cuda"
-tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
-model = FuyuForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=dtype, revision=revision)
 processor = FuyuProcessor(image_processor=FuyuImageProcessor(), tokenizer=tokenizer)
 caption_prompt = "Generate a coco-style caption.\\n"
 def predict(image, prompt):
     # image = image.convert('RGB')
     model_inputs = processor(text=prompt, images=[image])
     model_inputs = {k: v.to(dtype=dtype if torch.is_floating_point(v) else v.dtype, device=device) for k,v in model_inputs.items()}
@@ -57,7 +69,7 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab("Visual Question Answering"):
         with gr.Row():
             with gr.Column():
-                image_input = gr.Image(label="Upload your Image")
                 text_input = gr.Textbox(label="Ask a Question")
             vqa_output = gr.Textbox(label="Output")
@@ -75,7 +87,7 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab("Image Captioning"):
         with gr.Row():
-            captioning_input = gr.Image(label="Upload your Image")
             captioning_output = gr.Textbox(label="Output")
         captioning_btn = gr.Button("Generate Caption")

 import gradio as gr
 import torch
 from transformers import FuyuForCausalLM, AutoTokenizer
 from transformers.models.fuyu.processing_fuyu import FuyuProcessor
 from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
+from PIL import Image
 model_id = "adept/fuyu-8b"
 dtype = torch.bfloat16
 device = "cuda"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = FuyuForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=dtype)
 processor = FuyuProcessor(image_processor=FuyuImageProcessor(), tokenizer=tokenizer)
 caption_prompt = "Generate a coco-style caption.\\n"
+def resize_to_max(image, max_width=1920, max_height=1080):
+    width, height = image.size
+    if width <= max_width and height <= max_height:
+        return image
+    scale = min(max_width/width, max_height/height)
+    width = int(width*scale)
+    height = int(height*scale)
+    return image.resize((width, height), Image.LANCZOS)
 def predict(image, prompt):
     # image = image.convert('RGB')
+    image = resize_to_max(image)
     model_inputs = processor(text=prompt, images=[image])
     model_inputs = {k: v.to(dtype=dtype if torch.is_floating_point(v) else v.dtype, device=device) for k,v in model_inputs.items()}
     with gr.Tab("Visual Question Answering"):
         with gr.Row():
             with gr.Column():
+                image_input = gr.Image(label="Upload your Image", type="pil")
                 text_input = gr.Textbox(label="Ask a Question")
             vqa_output = gr.Textbox(label="Output")
     with gr.Tab("Image Captioning"):
         with gr.Row():
+            captioning_input = gr.Image(label="Upload your Image", type="pil")
             captioning_output = gr.Textbox(label="Output")
         captioning_btn = gr.Button("Generate Caption")

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-git+https://github.com/huggingface/transformers.git@add_fuyu_model
 accelerate
 torch==2.0.1

+git+https://github.com/huggingface/transformers.git
 accelerate
 torch==2.0.1