Spaces: hysts / (Running on Zero)

hysts (HF staff) committed on
Commit 1cfb0d6
1 Parent(s): a5b3bac
Files changed (1)
  1. app.py +94 -39
app.py CHANGED
@@ -18,10 +18,12 @@ if not torch.cuda.is_available():
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+MODEL_ID_OPT_2_7B = "Salesforce/blip2-opt-2.7b"
 MODEL_ID_OPT_6_7B = "Salesforce/blip2-opt-6.7b"
+MODEL_ID_FLAN_T5_XL = "Salesforce/blip2-flan-t5-xl"
 MODEL_ID_FLAN_T5_XXL = "Salesforce/blip2-flan-t5-xxl"
 MODEL_ID = os.getenv("MODEL_ID", MODEL_ID_FLAN_T5_XXL)
-assert MODEL_ID in [MODEL_ID_OPT_6_7B, MODEL_ID_FLAN_T5_XXL]
+assert MODEL_ID in [MODEL_ID_OPT_2_7B, MODEL_ID_OPT_6_7B, MODEL_ID_FLAN_T5_XL, MODEL_ID_FLAN_T5_XXL]
 
 if torch.cuda.is_available():
     processor = AutoProcessor.from_pretrained(MODEL_ID)
@@ -31,10 +33,14 @@ if torch.cuda.is_available():
 @spaces.GPU
 def generate_caption(
     image: PIL.Image.Image,
-    decoding_method: str,
-    temperature: float,
-    length_penalty: float,
-    repetition_penalty: float,
+    decoding_method: str = "Nucleus sampling",
+    temperature: float = 1.0,
+    length_penalty: float = 1.0,
+    repetition_penalty: float = 1.5,
+    max_length: int = 50,
+    min_length: int = 1,
+    num_beams: int = 5,
+    top_p: float = 0.9,
 ) -> str:
     inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
     generated_ids = model.generate(
@@ -43,10 +49,10 @@ def generate_caption(
         temperature=temperature,
         length_penalty=length_penalty,
         repetition_penalty=repetition_penalty,
-        max_length=50,
-        min_length=1,
-        num_beams=5,
-        top_p=0.9,
+        max_length=max_length,
+        min_length=min_length,
+        num_beams=num_beams,
+        top_p=top_p,
     )
     result = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
     return result
@@ -55,23 +61,27 @@ def generate_caption(
 @spaces.GPU
 def answer_question(
     image: PIL.Image.Image,
-    text: str,
-    decoding_method: str,
-    temperature: float,
-    length_penalty: float,
-    repetition_penalty: float,
+    prompt: str,
+    decoding_method: str = "Nucleus sampling",
+    temperature: float = 1.0,
+    length_penalty: float = 1.0,
+    repetition_penalty: float = 1.5,
+    max_length: int = 50,
+    min_length: int = 1,
+    num_beams: int = 5,
+    top_p: float = 0.9,
 ) -> str:
-    inputs = processor(images=image, text=text, return_tensors="pt").to(device, torch.float16)
+    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)
     generated_ids = model.generate(
         **inputs,
         do_sample=decoding_method == "Nucleus sampling",
         temperature=temperature,
         length_penalty=length_penalty,
         repetition_penalty=repetition_penalty,
-        max_length=30,
-        min_length=1,
-        num_beams=5,
-        top_p=0.9,
+        max_length=max_length,
+        min_length=min_length,
+        num_beams=num_beams,
+        top_p=top_p,
     )
     result = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
     return result
@@ -86,10 +96,14 @@ def postprocess_output(output: str) -> str:
 def chat(
     image: PIL.Image.Image,
     text: str,
-    decoding_method: str,
-    temperature: float,
-    length_penalty: float,
-    repetition_penalty: float,
+    decoding_method: str = "Nucleus sampling",
+    temperature: float = 1.0,
+    length_penalty: float = 1.0,
+    repetition_penalty: float = 1.5,
+    max_length: int = 50,
+    min_length: int = 1,
+    num_beams: int = 5,
+    top_p: float = 0.9,
     history_orig: list[str] = [],
     history_qa: list[str] = [],
 ) -> tuple[list[tuple[str, str]], list[str], list[str]]:
@@ -99,12 +113,16 @@ def chat(
     prompt = " ".join(history_qa)
 
     output = answer_question(
-        image,
-        prompt,
-        decoding_method,
-        temperature,
-        length_penalty,
-        repetition_penalty,
+        image=image,
+        prompt=prompt,
+        decoding_method=decoding_method,
+        temperature=temperature,
+        length_penalty=length_penalty,
+        repetition_penalty=repetition_penalty,
+        max_length=max_length,
+        min_length=min_length,
+        num_beams=num_beams,
+        top_p=top_p,
     )
     output = postprocess_output(output)
     history_orig.append(output)
@@ -160,7 +178,7 @@ with gr.Blocks(css="style.css") as demo:
         clear_chat_button = gr.Button("Clear")
         chat_button = gr.Button("Submit", variant="primary")
     with gr.Accordion(label="Advanced settings", open=False):
-        sampling_method = gr.Radio(
+        text_decoding_method = gr.Radio(
            label="Text Decoding Method",
            choices=["Beam search", "Nucleus sampling"],
            value="Nucleus sampling",
@@ -170,24 +188,53 @@ with gr.Blocks(css="style.css") as demo:
            info="Used with nucleus sampling.",
            minimum=0.5,
            maximum=1.0,
-           value=1.0,
            step=0.1,
+           value=1.0,
        )
        length_penalty = gr.Slider(
            label="Length Penalty",
            info="Set to larger for longer sequence, used with beam search.",
            minimum=-1.0,
            maximum=2.0,
-           value=1.0,
            step=0.2,
+           value=1.0,
        )
-       rep_penalty = gr.Slider(
-           label="Repeat Penalty",
+       repetition_penalty = gr.Slider(
+           label="Repetition Penalty",
            info="Larger value prevents repetition.",
            minimum=1.0,
            maximum=5.0,
-           value=1.5,
            step=0.5,
+           value=1.5,
+       )
+       max_length = gr.Slider(
+           label="Max Length",
+           minimum=1,
+           maximum=512,
+           step=1,
+           value=50,
+       )
+       min_length = gr.Slider(
+           label="Minimum Length",
+           minimum=1,
+           maximum=100,
+           step=1,
+           value=1,
+       )
+       num_beams = gr.Slider(
+           label="Number of Beams",
+           minimum=1,
+           maximum=10,
+           step=1,
+           value=5,
+       )
+       top_p = gr.Slider(
+           label="Top P",
+           info="Used with nucleus sampling.",
+           minimum=0.5,
+           maximum=1.0,
+           step=0.1,
+           value=0.9,
        )
 
    gr.Examples(
@@ -199,10 +246,14 @@ with gr.Blocks(css="style.css") as demo:
        fn=generate_caption,
        inputs=[
            image,
-           sampling_method,
+           text_decoding_method,
            temperature,
            length_penalty,
-           rep_penalty,
+           repetition_penalty,
+           max_length,
+           min_length,
+           num_beams,
+           top_p,
        ],
        outputs=caption_output,
        api_name="caption",
@@ -211,10 +262,14 @@ with gr.Blocks(css="style.css") as demo:
    chat_inputs = [
        image,
        vqa_input,
-       sampling_method,
+       text_decoding_method,
        temperature,
        length_penalty,
-       rep_penalty,
+       repetition_penalty,
+       max_length,
+       min_length,
+       num_beams,
+       top_p,
        history_orig,
        history_qa,
    ]
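
Because every generation setting now has a keyword default, the Gradio callbacks can also be exercised directly as plain functions. The snippet below is a minimal usage sketch, not part of the commit: it assumes app.py has been imported in an environment where the BLIP-2 model actually loaded (CUDA available), and the image path and VQA prompt are placeholders.

# Minimal usage sketch (assumption: app.py imported with a CUDA-backed model loaded).
import PIL.Image

from app import answer_question, generate_caption  # hypothetical import of this Space's module

image = PIL.Image.open("example.jpg")  # placeholder image path

# All generation settings have defaults, so a bare call works:
# Nucleus sampling, temperature=1.0, repetition_penalty=1.5, max_length=50, num_beams=5, top_p=0.9.
caption = generate_caption(image)

# Override only what you need; with "Beam search", do_sample is False, so top_p/temperature are not used.
answer = answer_question(
    image,
    "Question: What is shown in the image? Answer:",  # placeholder prompt
    decoding_method="Beam search",
    max_length=30,
    num_beams=5,
)
print(caption, answer)

The same defaults are mirrored in the new sliders, so the UI, the API endpoint (api_name="caption"), and direct calls all start from identical settings.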