Spaces:

wiusdy
/

VQA_fashion_hvar

Sleeping

App Files Community

wiusdy committed on Feb 13

Commit

9c010ec

•

1 Parent(s): fc0c2dc

making the model comparison

Browse files

Files changed (2) hide show

app.py +4 -5
inference.py +24 -17

app.py CHANGED Viewed

@@ -7,17 +7,16 @@ inference = Inference()
 with gr.Blocks() as block:
-    options = gr.Dropdown(choices=["ViLT", "Blip Saffal", "Blip CN"], label="Models", info="Select the model to use..", )
-    # need to improve this one...
     txt = gr.Textbox(label="Insert a question..", lines=2)
-    txt_3 = gr.Textbox(value="", label="Your answer is here..")
     btn = gr.Button(value="Submit")
     dogs = os.path.join(os.path.dirname(__file__), "617.jpg")
     image = gr.Image(type="pil", value=dogs)
-    btn.click(inference.inference, inputs=[options, image, txt], outputs=[txt_3])
 if __name__ == "__main__":
     block.launch()

 with gr.Blocks() as block:
     txt = gr.Textbox(label="Insert a question..", lines=2)
+    outputs = [gr.outputs.Textbox(label="Answer from BLIP saffal model"), gr.outputs.Textbox(label="Answer from BLIP control net"),
+               gr.outputs.Textbox(label="Answer from ViLT saffal model"), gr.outputs.Textbox(label="Answer from ViLT control net")]
     btn = gr.Button(value="Submit")
     dogs = os.path.join(os.path.dirname(__file__), "617.jpg")
     image = gr.Image(type="pil", value=dogs)
+    btn.click(inference.inference, inputs=[image, txt], outputs=outputs)
 if __name__ == "__main__":
     block.launch()

inference.py CHANGED Viewed

@@ -6,31 +6,38 @@ import torch
 class Inference:
     def __init__(self):
         self.vilt_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
-        self.vilt_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
         self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
         self.blip_model_saffal = BlipForQuestionAnswering.from_pretrained("wiusdy/blip_pretrained_saffal_fashion_finetuning")
         self.blip_model_control_net = BlipForQuestionAnswering.from_pretrained("wiusdy/blip_pretrained_control_net_fashion_finetuning")
         logging.set_verbosity_info()
         self.logger = logging.get_logger("transformers")
-    def inference(self, selected, image, text):
-        self.logger.info(f"selected model {selected}")
-        if selected == "ViLT":
-            return self.__inference_vilt(image, text)
-        elif selected == "Blip Saffal":
-            return self.__inference_saffal_blip(image, text)
-        elif selected == "Blip CN":
-            return self.__inference_control_net_blip(image, text)
-        else:
-            self.logger.warning("Please select a model to make the inference..")
-    def __inference_vilt(self, image, text):
         encoding = self.vilt_processor(image, text, return_tensors="pt")
-        outputs = self.vilt_model(**encoding)
-        logits = outputs.logits
-        idx = logits.argmax(-1).item()
-        return f"{self.vilt_model.config.id2label[idx]}"
     def __inference_saffal_blip(self, image, text):
         encoding = self.blip_processor(image, text, return_tensors="pt")

 class Inference:
     def __init__(self):
         self.vilt_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+        self.vilt_model_saffal = BlipForQuestionAnswering.from_pretrained("wiusdy/vilt_saffal_model")
+        self.vilt_model_control_net = BlipForQuestionAnswering.from_pretrained("wiusdy/vilt_control_net")
         self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
         self.blip_model_saffal = BlipForQuestionAnswering.from_pretrained("wiusdy/blip_pretrained_saffal_fashion_finetuning")
         self.blip_model_control_net = BlipForQuestionAnswering.from_pretrained("wiusdy/blip_pretrained_control_net_fashion_finetuning")
         logging.set_verbosity_info()
         self.logger = logging.get_logger("transformers")
+    def inference(self, image, text):
+        self.logger.info(f"Running inference for model ViLT Saffal")
+        ViLT_saffal_inference = self.__inference_vilt_saffal(image, text)
+        self.logger.info(f"Running inference for model ViLT Control Net")
+        ViLT_control_net_inference = self.__inference_vilt_control_net(image, text)
+        self.logger.info(f"Running inference for model BLIP Saffal")
+        BLIP_saffal_inference = self.__inference_saffal_blip(image, text)
+        self.logger.info(f"Running inference for model BLIP Control Net")
+        BLIP_control_net_inference = self.__inference_control_net_blip(image, text)
+        return BLIP_saffal_inference, BLIP_control_net_inference, ViLT_saffal_inference, ViLT_control_net_inference
+    def __inference_vilt_saffal(self, image, text):
+        encoding = self.vilt_processor(image, text, return_tensors="pt")
+        out = self.vilt_model_saffal.generate(**encoding)
+        generated_text = self.vilt_processor.decode(out[0], skip_special_tokens=True)
+        return f"{generated_text}"
+    def __inference_vilt_control_net(self, image, text):
         encoding = self.vilt_processor(image, text, return_tensors="pt")
+        out = self.vilt_model_control_net.generate(**encoding)
+        generated_text = self.vilt_processor.decode(out[0], skip_special_tokens=True)
+        return f"{generated_text}"
     def __inference_saffal_blip(self, image, text):
         encoding = self.blip_processor(image, text, return_tensors="pt")