jaketae committed on
Commit
83d94a8
•
1 Parent(s): 2e45025

feature: replace comma separated input w/ counter ui

Files changed (1)
  1. image2text.py +41 -26
image2text.py CHANGED
@@ -15,7 +15,7 @@ def app(model_name):
     st.title("Zero-shot Image Classification")
     st.markdown(
         """
-    This demonstration explores capability of KoCLIP in the field of Zero-Shot Prediction. This demo takes a set of image and captions from, and predicts the most likely label among the different captions given.
+    This demonstration explores the capability of KoCLIP in the field of zero-shot prediction. This demo takes an image and a set of captions from the user, and predicts the most likely label among the given captions.
 
     KoCLIP is a retraining of OpenAI's CLIP model using 82,783 images from [MSCOCO](https://cocodataset.org/#home) dataset and Korean caption annotations. Korean translation of caption annotations were obtained from [AI Hub](https://aihub.or.kr/keti_data_board/visual_intelligence). Base model `koclip` uses `klue/roberta` as text encoder and `openai/clip-vit-base-patch32` as image encoder. Larger model `koclip-large` uses `klue/roberta` as text encoder and bigger `google/vit-large-patch16-224` as image encoder.
         """
@@ -27,32 +27,47 @@ def app(model_name):
     )
     query2 = st.file_uploader("or upload an image...", type=["jpg", "jpeg", "png"])
 
-    captions = st.text_input(
-        "Enter candidate captions in comma-separated form.",
-        value="귀여운 고양이,멋있는 강아지,포동포동한 햄스터",
-    )
+    col1, col2 = st.beta_columns([3, 1])
+
+    with col2:
+        captions_count = st.selectbox(
+            "Number of labels", options=range(1, 6), index=2
+        )
+        compute = st.button("Classify")
 
-    if st.button("질문 (Query)"):
+    with col1:
+        captions = []
+        defaults = ["귀여운 고양이", "멋있는 강아지", "포동포동한 햄스터"]
+        for idx in range(captions_count):
+            value = defaults[idx] if idx < len(defaults) else ""
+            captions.append(st.text_input(f"Insert label {idx+1}", value=value))
+
+    if compute:
         if not any([query1, query2]):
             st.error("Please upload an image or paste an image URL.")
         else:
-            image_data = (
-                query2 if query2 is not None else requests.get(query1, stream=True).raw
-            )
-            image = Image.open(image_data)
-            st.image(image)
-            # captions = [caption.strip() for caption in captions.split(",")]
-            captions = [f"이것은 {caption.strip()}이다." for caption in captions.split(",")]
-            inputs = processor(
-                text=captions, images=image, return_tensors="jax", padding=True
-            )
-            inputs["pixel_values"] = jnp.transpose(
-                inputs["pixel_values"], axes=[0, 2, 3, 1]
-            )
-            outputs = model(**inputs)
-            probs = jax.nn.softmax(outputs.logits_per_image, axis=1)
-            score_dict = {captions[idx]: prob for idx, prob in enumerate(*probs)}
-            df = pd.DataFrame(score_dict.values(), index=score_dict.keys())
-            st.bar_chart(df)
-            # for idx, prob in sorted(enumerate(*probs), key=lambda x: x[1], reverse=True):
-            #     st.text(f"Score: `{prob}`, {captions[idx]}")
+            st.markdown("""---""")
+            with st.spinner("Computing..."):
+                image_data = (
+                    query2 if query2 is not None else requests.get(query1, stream=True).raw
+                )
+                image = Image.open(image_data)
+
+                # captions = [caption.strip() for caption in captions.split(",")]
+                captions = [f"이것은 {caption.strip()}이다." for caption in captions]
+                inputs = processor(
+                    text=captions, images=image, return_tensors="jax", padding=True
+                )
+                inputs["pixel_values"] = jnp.transpose(
+                    inputs["pixel_values"], axes=[0, 2, 3, 1]
+                )
+                outputs = model(**inputs)
+                probs = jax.nn.softmax(outputs.logits_per_image, axis=1)
+                chart_data = pd.Series(probs[0], index=captions)
+
+                col1, col2 = st.beta_columns(2)
+                with col1:
+                    st.image(image)
+                with col2:
+                    st.bar_chart(chart_data)
+
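For reference, below is a minimal standalone sketch of the counter-style input this commit introduces: a selectbox fixes how many `st.text_input` fields are rendered, replacing the single comma-separated field. The `MAX_LABELS` constant and widget labels here are illustrative, not part of the diff, and newer Streamlit releases expose `st.columns` in place of the `st.beta_columns` API used above.

```python
# Minimal sketch of the counter-based label input (illustrative names, not the app's exact code).
import streamlit as st

MAX_LABELS = 5  # assumed upper bound, mirroring options=range(1, 6) in the diff
DEFAULTS = ["귀여운 고양이", "멋있는 강아지", "포동포동한 햄스터"]  # "cute cat", "cool dog", "chubby hamster"

# The selectbox acts as the counter: it decides how many text inputs get rendered.
num_labels = st.selectbox("Number of labels", options=range(1, MAX_LABELS + 1), index=2)

# Render one text input per label, pre-filled with a default value where available.
labels = []
for idx in range(num_labels):
    value = DEFAULTS[idx] if idx < len(DEFAULTS) else ""
    labels.append(st.text_input(f"Insert label {idx + 1}", value=value))

if st.button("Classify"):
    st.write([label.strip() for label in labels if label.strip()])
```

Separately, the `jnp.transpose(..., axes=[0, 2, 3, 1])` call carried over from the old code reorders the processor's channels-first `pixel_values` into a channels-last layout, presumably because the Flax image encoder in this hybrid setup expects channels-last input. A quick sketch of the shape change, with the 3-channel 224×224 shape assumed for illustration:

```python
import jax.numpy as jnp

x = jnp.zeros((1, 3, 224, 224))               # processor output: (batch, channels, height, width)
x_nhwc = jnp.transpose(x, axes=[0, 2, 3, 1])  # (batch, height, width, channels), i.e. (1, 224, 224, 3)
```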