image-caption-server

Paused

App Files Files Community

hysts HF Staff committed on Oct 21, 2023

Commit

bc904d0

1 Parent(s): d4d8571

Add files

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +144 -3
requirements.txt +7 -0

README.md CHANGED Viewed

@@ -1,5 +1,4 @@
 ---
-license: mit
 title: InstructBLIP
 emoji: ⚡
 colorFrom: red
@@ -9,6 +8,7 @@ sdk_version: 3.50.2
 python_version: 3.10.13
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: InstructBLIP
 emoji: ⚡
 colorFrom: red
 python_version: 3.10.13
 app_file: app.py
 pinned: false
+license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,9 +1,150 @@
 #!/usr/bin/env python
 import gradio as gr
-with gr.Blocks() as demo:
-    pass
 if __name__ == "__main__":
-    demo.queue().launch()

 #!/usr/bin/env python
+from __future__ import annotations
+import os
 import gradio as gr
+import PIL.Image
+import spaces
+import torch
+from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor
+# Markdown text passed to gr.Markdown in the UI definition.
+DESCRIPTION = "# InstructBLIP"
+# Upper bound on the longer image side used by run(); overridable through the
+# MAX_IMAGE_SIZE environment variable (defaults to 1024).
+MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
+# Device the processed inputs are moved to in run(): CUDA when available, otherwise CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_id = "Salesforce/instructblip-vicuna-7b"
+# Processor and model are loaded once at import time; the model is loaded with
+# float16 weights and device_map="auto".
+processor = InstructBlipProcessor.from_pretrained(model_id)
+model = InstructBlipForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+@spaces.GPU
+def run(
+    image: PIL.Image.Image,
+    prompt: str,
+    text_decoding_method: str = "Nucleus sampling",
+    num_beams: int = 5,
+    max_length: int = 256,
+    min_length: int = 1,
+    top_p: float = 0.9,
+    repetition_penalty: float = 1.5,
+    length_penalty: float = 1.0,
+    temperature: float = 1.0,
+) -> str:
+    """Generate text for ``image`` conditioned on ``prompt``.
+
+    The image is downscaled (aspect ratio preserved) so that its longer side
+    does not exceed ``MAX_IMAGE_SIZE`` before it is handed to the processor.
+
+    Args:
+        image: Input image.
+        prompt: Text passed to the processor together with the image.
+        text_decoding_method: Sampling is enabled only when this equals
+            ``"Nucleus sampling"``; any other value disables ``do_sample``.
+        num_beams: Forwarded to ``model.generate``.
+        max_length: Forwarded to ``model.generate``.
+        min_length: Forwarded to ``model.generate``.
+        top_p: Forwarded to ``model.generate``.
+        repetition_penalty: Forwarded to ``model.generate``.
+        length_penalty: Forwarded to ``model.generate``.
+        temperature: Forwarded to ``model.generate``.
+
+    Returns:
+        The first decoded sequence with special tokens skipped and
+        surrounding whitespace stripped.
+    """
+    # PIL reports size as (width, height). Unpacking it in the opposite order
+    # would transpose the target size passed to resize() and distort
+    # non-square images.
+    w, h = image.size
+    scale = MAX_IMAGE_SIZE / max(w, h)
+    if scale < 1:
+        # Clamp to 1 px so an extreme aspect ratio cannot truncate a side to 0.
+        new_w = max(1, int(w * scale))
+        new_h = max(1, int(h * scale))
+        image = image.resize((new_w, new_h), resample=PIL.Image.Resampling.LANCZOS)
+    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)
+    generated_ids = model.generate(
+        **inputs,
+        do_sample=text_decoding_method == "Nucleus sampling",
+        num_beams=num_beams,
+        max_length=max_length,
+        min_length=min_length,
+        top_p=top_p,
+        repetition_penalty=repetition_penalty,
+        length_penalty=length_penalty,
+        temperature=temperature,
+    )
+    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+    return generated_caption
+# UI definition: the first column holds the image/prompt inputs and the
+# generation options, the second column holds the result textbox.
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(type="pil")
+            prompt = gr.Textbox(label="Prompt")
+            run_button = gr.Button()
+            # Generation settings, collapsed by default (open=False). Each
+            # control's initial value equals the matching keyword default of run().
+            with gr.Accordion(label="Advanced options", open=False):
+                text_decoding_method = gr.Radio(
+                    label="Text Decoding Method",
+                    choices=["Beam search", "Nucleus sampling"],
+                    value="Nucleus sampling",
+                )
+                num_beams = gr.Slider(
+                    label="Number of Beams",
+                    minimum=1,
+                    maximum=10,
+                    step=1,
+                    value=5,
+                )
+                max_length = gr.Slider(
+                    label="Max Length",
+                    minimum=1,
+                    maximum=512,
+                    step=1,
+                    value=256,
+                )
+                min_length = gr.Slider(
+                    label="Minimum Length",
+                    minimum=1,
+                    maximum=64,
+                    step=1,
+                    value=1,
+                )
+                top_p = gr.Slider(
+                    label="Top P",
+                    minimum=0.1,
+                    maximum=1.0,
+                    step=0.1,
+                    value=0.9,
+                )
+                repetition_penalty = gr.Slider(
+                    label="Repetition Penalty",
+                    info="Larger value prevents repetition.",
+                    minimum=1.0,
+                    maximum=5.0,
+                    step=0.5,
+                    value=1.5,
+                )
+                length_penalty = gr.Slider(
+                    label="Length Penalty",
+                    info="Set to larger for longer sequence, used with beam search.",
+                    minimum=-1.0,
+                    maximum=2.0,
+                    step=0.2,
+                    value=1.0,
+                )
+                temperature = gr.Slider(
+                    label="Temperature",
+                    info="Used with nucleus sampling.",
+                    minimum=0.5,
+                    maximum=1.0,
+                    step=0.1,
+                    value=1.0,
+                )
+        with gr.Column():
+            output = gr.Textbox(label="Result")
+    # Call run() when the prompt is submitted or the button is clicked. The
+    # inputs are listed in the same order as run()'s parameters.
+    gr.on(
+        triggers=[prompt.submit, run_button.click],
+        fn=run,
+        inputs=[
+            input_image,
+            prompt,
+            text_decoding_method,
+            num_beams,
+            max_length,
+            min_length,
+            top_p,
+            repetition_penalty,
+            length_penalty,
+            temperature,
+        ],
+        outputs=output,
+        api_name="run",
+    )
 if __name__ == "__main__":
+    # Enable the request queue with max_size=20, then start the app.
+    demo.queue(max_size=20).launch()

requirements.txt CHANGED Viewed

	@@ -0,0 +1,7 @@

+accelerate==0.23.0
+gradio==3.50.2
+Pillow==10.1.0
+spaces==0.16.3
+torch==2.0.0
+torchvision==0.15.1
+transformers==4.34.1