Spaces:

Soumen
/

image_to_text

Running

App Files Community

Soumen committed on Nov 5, 2022

Commit

02635d7

•

1 Parent(s): d69593b

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -14

app.py CHANGED Viewed

@@ -28,6 +28,20 @@ uploaded_photo = c2.file_uploader("Upload Image",type=['jpg','png','jpeg'], on_c
 feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 camera_photo = c2.camera_input("Take a photo", on_change=change_photo_state)
 if choice == 'Caption':
     #st.subheader("Detection")
     if st.session_state["photo"]=="done":
@@ -37,20 +51,6 @@ if choice == 'Caption':
             our_image= load_image(camera_photo)
         elif uploaded_photo==None and camera_photo==None:
             our_image= load_image('image.jpg')
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model.to(device)
-        max_length = 16
-        num_beams = 4
-        gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-        def predict_step(our_image):
-            if our_image.mode != "RGB":
-                our_image = our_image.convert(mode="RGB")
-            pixel_values = feature_extractor(images=our_image, return_tensors="pt").pixel_values
-            pixel_values = pixel_values.to(device)
-            output_ids = model.generate(pixel_values, **gen_kwargs)
-            preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-            preds = [pred.strip() for pred in preds]
-            return preds
         st.success(predict_step(our_image))
 elif choice == 'About':
 	st.subheader("About Image Captioning App")

 feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 camera_photo = c2.camera_input("Take a photo", on_change=change_photo_state)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+max_length = 16
+num_beams = 4
+gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+def predict_step(our_image):
+   if our_image.mode != "RGB":
+      our_image = our_image.convert(mode="RGB")
+   pixel_values = feature_extractor(images=our_image, return_tensors="pt").pixel_values
+   pixel_values = pixel_values.to(device)
+   output_ids = model.generate(pixel_values, **gen_kwargs)
+   preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+   preds = [pred.strip() for pred in preds]
+   return preds
 if choice == 'Caption':
     #st.subheader("Detection")
     if st.session_state["photo"]=="done":
             our_image= load_image(camera_photo)
         elif uploaded_photo==None and camera_photo==None:
             our_image= load_image('image.jpg')
         st.success(predict_step(our_image))
 elif choice == 'About':
 	st.subheader("About Image Captioning App")