pcuenq HF staff committed on
Commit
f160eaf
1 Parent(s): 5cc174c

Scaling fix + final weights (#1)

Browse files

- Workaround for scaling bug in transformers (d9a4d76f13ecd995b9b83e2ca93f890aa3878881)
- Use main branches (5e2122e233d1da68e93f0f3b2023c70e8b9521e4)

Files changed (2) hide show
  1. app.py +18 -6
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,23 +1,35 @@
1
  import gradio as gr
2
- import os
3
  import torch
4
  from transformers import FuyuForCausalLM, AutoTokenizer
5
  from transformers.models.fuyu.processing_fuyu import FuyuProcessor
6
  from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
 
7
 
8
  model_id = "adept/fuyu-8b"
9
- revision = "refs/pr/3"
10
  dtype = torch.bfloat16
11
  device = "cuda"
12
 
13
- tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
14
- model = FuyuForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=dtype, revision=revision)
15
  processor = FuyuProcessor(image_processor=FuyuImageProcessor(), tokenizer=tokenizer)
16
 
17
  caption_prompt = "Generate a coco-style caption.\\n"
18
 
 
 
 
 
 
 
 
 
 
 
 
19
  def predict(image, prompt):
20
  # image = image.convert('RGB')
 
 
21
  model_inputs = processor(text=prompt, images=[image])
22
  model_inputs = {k: v.to(dtype=dtype if torch.is_floating_point(v) else v.dtype, device=device) for k,v in model_inputs.items()}
23
 
@@ -57,7 +69,7 @@ with gr.Blocks(css=css) as demo:
57
  with gr.Tab("Visual Question Answering"):
58
  with gr.Row():
59
  with gr.Column():
60
- image_input = gr.Image(label="Upload your Image")
61
  text_input = gr.Textbox(label="Ask a Question")
62
  vqa_output = gr.Textbox(label="Output")
63
 
@@ -75,7 +87,7 @@ with gr.Blocks(css=css) as demo:
75
 
76
  with gr.Tab("Image Captioning"):
77
  with gr.Row():
78
- captioning_input = gr.Image(label="Upload your Image")
79
  captioning_output = gr.Textbox(label="Output")
80
  captioning_btn = gr.Button("Generate Caption")
81
 
 
1
  import gradio as gr
 
2
  import torch
3
  from transformers import FuyuForCausalLM, AutoTokenizer
4
  from transformers.models.fuyu.processing_fuyu import FuyuProcessor
5
  from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
6
+ from PIL import Image
7
 
8
  model_id = "adept/fuyu-8b"
 
9
  dtype = torch.bfloat16
10
  device = "cuda"
11
 
12
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
13
+ model = FuyuForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=dtype)
14
  processor = FuyuProcessor(image_processor=FuyuImageProcessor(), tokenizer=tokenizer)
15
 
16
  caption_prompt = "Generate a coco-style caption.\\n"
17
 
18
+ def resize_to_max(image, max_width=1920, max_height=1080):
19
+ width, height = image.size
20
+ if width <= max_width and height <= max_height:
21
+ return image
22
+
23
+ scale = min(max_width/width, max_height/height)
24
+ width = int(width*scale)
25
+ height = int(height*scale)
26
+
27
+ return image.resize((width, height), Image.LANCZOS)
28
+
29
  def predict(image, prompt):
30
  # image = image.convert('RGB')
31
+ image = resize_to_max(image)
32
+
33
  model_inputs = processor(text=prompt, images=[image])
34
  model_inputs = {k: v.to(dtype=dtype if torch.is_floating_point(v) else v.dtype, device=device) for k,v in model_inputs.items()}
35
 
 
69
  with gr.Tab("Visual Question Answering"):
70
  with gr.Row():
71
  with gr.Column():
72
+ image_input = gr.Image(label="Upload your Image", type="pil")
73
  text_input = gr.Textbox(label="Ask a Question")
74
  vqa_output = gr.Textbox(label="Output")
75
 
 
87
 
88
  with gr.Tab("Image Captioning"):
89
  with gr.Row():
90
+ captioning_input = gr.Image(label="Upload your Image", type="pil")
91
  captioning_output = gr.Textbox(label="Output")
92
  captioning_btn = gr.Button("Generate Caption")
93
 
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- git+https://github.com/huggingface/transformers.git@add_fuyu_model
2
  accelerate
3
  torch==2.0.1
 
1
+ git+https://github.com/huggingface/transformers.git
2
  accelerate
3
  torch==2.0.1