Fixed the issue with loading the model for inference on a CPU device
The model was trained on GPU with bitsandbytes and PEFT, but bitsandbytes works only on GPU devices, so the model initialization and the input dtype are modified to work on CPU.
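In short: keep the 8-bit bitsandbytes path when a GPU is available, and fall back to a plain full-precision load on CPU before attaching the PEFT adapter. A minimal sketch of that idea, assuming the same local checkpoint paths as app.py (the load_model_for_device helper and the explicit device check are illustrative, not the actual app.py code):

import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from peft import PeftModel

preprocess_ckp = "Salesforce/blip2-opt-2.7b"            #Processor checkpoint
base_model_ckp = "./model/blip2-opt-2.7b-fp16-sharded"  #Local base model checkpoint (as in app.py)
peft_model_ckp = "./model/blip2_peft"                   #Local PEFT adapter checkpoint (as in app.py)

def load_model_for_device():
    #Illustrative helper, not part of app.py: pick the load path based on the available device
    processor = Blip2Processor.from_pretrained(preprocess_ckp)

    if torch.cuda.is_available():
        #GPU path: 8-bit quantization via bitsandbytes, as in the original GPU-only code
        model = Blip2ForConditionalGeneration.from_pretrained(base_model_ckp, load_in_8bit = True, device_map = "auto")
    else:
        #CPU path: bitsandbytes 8-bit loading is not available, so load in full precision
        model = Blip2ForConditionalGeneration.from_pretrained(base_model_ckp)

    #Attach the fine-tuned PEFT adapter on top of the base model
    model = PeftModel.from_pretrained(model, peft_model_ckp)
    model.eval()
    return processor, model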
app.py
CHANGED
@@ -4,25 +4,32 @@ from peft import PeftModel
import streamlit as st
from PIL import Image
import torch
+import os

preprocess_ckp = "Salesforce/blip2-opt-2.7b" #Checkpoint used to preprocess the image
base_model_ckp = "./model/blip2-opt-2.7b-fp16-sharded" #Base model checkpoint path
peft_model_ckp = "./model/blip2_peft" #PEFT model checkpoint path
-
+sample_img_path = "./sample_images/"
+
#init_model_required = True
-processor = None
-model = None
+#processor = None
+#model = None

-def init_model():
+#def init_model():

#if init_model_required:

-
-
+#Preprocess input
+processor = Blip2Processor.from_pretrained(preprocess_ckp)
+
+#Model
+#Inference on GPU device. Fails on a CPU-only system, because "load_in_8bit" is a setting of the bitsandbytes library and works only on GPU
+#model = Blip2ForConditionalGeneration.from_pretrained(base_model_ckp, load_in_8bit = True, device_map = "auto")

-
-
-
+#Inference on CPU device
+model = Blip2ForConditionalGeneration.from_pretrained(base_model_ckp)
+
+model = PeftModel.from_pretrained(model, peft_model_ckp)

#init_model_required = False

@@ -32,10 +39,16 @@ def main():

    st.title("Fashion Image Caption using BLIP2")

-    init_model()
+    #init_model()

+    #Select from a few sample images for the clothing categories
+    option = st.selectbox('Sample images ?', ('cap', 'tee', 'dress'))
    file_name = st.file_uploader("Upload image")

+    if file_name is None and option is not None:
+
+        file_name = os.path.join(sample_img_path, option)
+
    if file_name is not None:

        image_col, caption_text = st.columns(2)
@@ -45,7 +58,12 @@ def main():
        image_col.image(image, use_column_width = True)

        #Preprocess the image
-
+        #Inference on GPU. Running this line on a CPU system gives errors like: "slow_conv2d_cpu" not implemented for 'Half', and: Input type (float) and bias type (struct c10::Half)
+        #inputs = processor(images = image, return_tensors = "pt").to('cuda', torch.float16)
+
+        #Inference on CPU
+        inputs = processor(images = image, return_tensors = "pt")
+
        pixel_values = inputs.pixel_values

        #Predict the caption for the image
@@ -56,6 +74,5 @@ def main():
        caption_text.header("Generated Caption")
        caption_text.text(generated_caption)

-
if __name__ == "__main__":
    main()
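The lines of main() between pixel_values and the caption display are not part of this diff. For reference only, a CPU-side generate-and-decode step would look roughly like the sketch below; the caption_on_cpu helper, the torch.no_grad() wrapper and the max_length value are assumptions for illustration, not code taken from app.py:

import torch

def caption_on_cpu(processor, model, image):
    #Sketch of the CPU inference path: no .to('cuda', torch.float16) cast on the inputs
    #image: a PIL.Image opened from the uploaded or sample file
    inputs = processor(images = image, return_tensors = "pt")   #stays in float32 on CPU
    pixel_values = inputs.pixel_values

    with torch.no_grad():
        generated_ids = model.generate(pixel_values = pixel_values, max_length = 25)

    #Decode the generated token ids into the caption text
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens = True)[0]
    return generated_caption.strip()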