g8a9 committed
Commit bd6347d • Parent: 35772cf

re-wording, grammarlyfy, update front page emoji

Files changed (3):
  1. README.md +1 -1
  2. examples.py +2 -5
  3. localization.py +30 -28
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Clip Italian Demo
-emoji: ⚑
+emoji: 🀌
 colorFrom: gray
 colorTo: pink
 sdk: streamlit
examples.py CHANGED
@@ -6,13 +6,10 @@ def app():
     st.title("Gallery")
     st.write(
         """
-
 
     Even though we trained the Italian CLIP model on way less examples than the original
-    OpenAI's CLIP, our training choices and quality datasets led to impressive results!
-    Here, we present some of **the most impressive text-image associations** learned by our model.
-
-    Remember you can head to the **Text to Image** section of the demo at any time to test your own🀌 Italian queries!
+    OpenAI's CLIP, our training choices and quality datasets led to impressive results.
+    Here, we present some of them.
 
     """
     )
localization.py CHANGED
@@ -13,10 +13,14 @@ import jax
 import gc
 
 
-preprocess = transforms.Compose([
-    transforms.ToTensor(),
-    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
-])
+preprocess = transforms.Compose(
+    [
+        transforms.ToTensor(),
+        transforms.Normalize(
+            (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)
+        ),
+    ]
+)
 
 
 def pad_to_square(image, size=224):
@@ -50,19 +54,19 @@ def gen_image_batch(image_url, image_size=224, pixel_size=10):
         masks.append(mask)
 
     for i in range(0, n_pixels):
-        for j in range(i+1, n_pixels):
+        for j in range(i + 1, n_pixels):
             m = mask.copy()
-            m[:min(i*pixel_size, image_size) + 1, :] = 0
-            m[min(j*pixel_size, image_size) + 1:, :] = 0
+            m[: min(i * pixel_size, image_size) + 1, :] = 0
+            m[min(j * pixel_size, image_size) + 1 :, :] = 0
             neg_m = 1 - m
             image_batch.append(image * m + gray * neg_m)
             masks.append(m)
 
-    for i in range(0, n_pixels+1):
-        for j in range(i+1, n_pixels+1):
+    for i in range(0, n_pixels + 1):
+        for j in range(i + 1, n_pixels + 1):
             m = mask.copy()
-            m[:, :min(i*pixel_size + 1, image_size)] = 0
-            m[:, min(j*pixel_size + 1, image_size):] = 0
+            m[:, : min(i * pixel_size + 1, image_size)] = 0
+            m[:, min(j * pixel_size + 1, image_size) :] = 0
             neg_m = 1 - m
             image_batch.append(image * m + gray * neg_m)
             masks.append(m)
@@ -75,7 +79,9 @@ def get_heatmap(image_url, text, pixel_size=10, iterations=3):
     model = get_model()
     image_size = model.config.vision_config.image_size
     text_embedding = text_encoder(text, model, tokenizer)
-    images, masks = gen_image_batch(image_url, image_size=image_size, pixel_size=pixel_size)
+    images, masks = gen_image_batch(
+        image_url, image_size=image_size, pixel_size=pixel_size
+    )
 
     input_image = images[0].copy()
     images = np.stack([preprocess(image) for image in images], axis=0)
@@ -106,10 +112,10 @@ def app():
 
     ### πŸ‘‹ Ciao!
 
-    Here you can find an example for zero shot localization that will show you where in an image the model sees an object.
+    Here you can find an example for zero-shot localization that will show you where in an image the model sees an object.
 
-    The location of the object is computed by masking different areas of the image and looking at
-    how the similarity to the image description changes. If you want to have a look at the implementation in details
+    The object location is computed by masking different areas of the image and looking at
+    how the similarity to the image description changes. If you want to have a look at the implementation in detail,
     you can find it in [this Colab](https://colab.research.google.com/drive/10neENr1DEAFq_GzsLqBDo0gZ50hOhkOr?usp=sharing).
 
     On the two parameters: the pixel size defines the resolution of the localization map. A pixel size of 15 means
@@ -132,17 +138,12 @@ def app():
 
     MAX_ITER = 1
 
-
     col1, col2 = st.beta_columns([3, 1])
 
     with col2:
-        pixel_size = st.selectbox(
-            "Pixel Size", options=range(10, 21, 5), index=0
-        )
+        pixel_size = st.selectbox("Pixel Size", options=range(10, 21, 5), index=0)
 
-        iterations = st.selectbox(
-            "Refinement Steps", options=range(3, 30, 3), index=0
-        )
+        iterations = st.selectbox("Refinement Steps", options=range(3, 30, 3), index=0)
 
         compute = st.button("LOCATE")
 
@@ -151,18 +152,19 @@ def app():
 
     if compute:
 
-        with st.spinner('Waiting for resources...'):
+        with st.spinner("Waiting for resources..."):
            sleep_time = 5
-            print('CPU_load', psutil.cpu_percent())
+            print("CPU_load", psutil.cpu_percent())
             while psutil.cpu_percent() > 60:
                 time.sleep(sleep_time)
 
-
        if not caption or not image_url:
            st.error("Please choose one image and at least one label")
        else:
-            with st.spinner("Computing... This might take up to a few minutes depending on the current load πŸ˜• \n"
-                            "[Colab Link](https://colab.research.google.com/drive/10neENr1DEAFq_GzsLqBDo0gZ50hOhkOr?usp=sharing)"):
+            with st.spinner(
+                "Computing... This might take up to a few minutes depending on the current load πŸ˜• \n"
+                "[Colab Link](https://colab.research.google.com/drive/10neENr1DEAFq_GzsLqBDo0gZ50hOhkOr?usp=sharing)"
+            ):
                heatmap, image = get_heatmap(image_url, caption, pixel_size, iterations)
 
                with col1:
@@ -172,7 +174,7 @@ def app():
         gc.collect()
 
     elif image_url:
-        image_raw = requests.get(image_url, stream=True, ).raw
+        image_raw = requests.get(image_url, stream=True,).raw
        image = Image.open(image_raw).convert("RGB")
        with col1:
            st.image(image)
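The hunks above only reformat `gen_image_batch` and `get_heatmap`, but they are the core of the trick the app text describes: hide bands of the image, re-score each partially hidden image against the caption, and accumulate the visible regions weighted by that score. A minimal self-contained sketch of the idea, assuming a stand-in `similarity_fn` for the repo's CLIP image-text scoring (which actually goes through `get_model`/`text_encoder`); the band layout only approximates the exact slicing in `gen_image_batch`:

import numpy as np

def band_masks(image_size=224, pixel_size=10):
    """Yield binary masks that keep one horizontal or vertical band of the
    image visible, loosely mirroring the loops in gen_image_batch."""
    n_pixels = image_size // pixel_size
    for axis in (0, 1):
        for i in range(n_pixels + 1):
            for j in range(i + 1, n_pixels + 1):
                m = np.ones((image_size, image_size, 1), dtype=np.float32)
                lo = min(i * pixel_size, image_size)
                hi = min(j * pixel_size, image_size)
                if axis == 0:
                    m[:lo, :] = 0  # hide rows above the band
                    m[hi:, :] = 0  # hide rows below the band
                else:
                    m[:, :lo] = 0  # hide columns left of the band
                    m[:, hi:] = 0  # hide columns right of the band
                yield m

def heatmap_sketch(image, similarity_fn, image_size=224, pixel_size=10):
    """Accumulate each mask weighted by how well the masked image still
    matches the caption. similarity_fn(masked_image) -> float is an assumed
    stand-in for the CLIP image-text score used inside get_heatmap."""
    gray = np.ones_like(image) * 0.5  # neutral fill for hidden areas
    heat = np.zeros((image_size, image_size, 1), dtype=np.float32)
    for m in band_masks(image_size, pixel_size):
        masked = image * m + gray * (1.0 - m)
        heat += m * similarity_fn(masked)
    return heat / heat.max()  # normalize to [0, 1]

Bright regions of the returned map are the areas whose visibility keeps the caption similarity high, which is exactly what the demo overlays on the input image.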
 
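On the two knobs the `st.selectbox` widgets expose, a hypothetical driver call (the URL and Italian caption are made-up examples; `get_heatmap` and its signature come from the hunks above): the pixel size sets the band width and therefore the resolution of the localization map, while the refinement steps presumably re-run the masking pass for a smoother map.

# Hypothetical usage matching the widget ranges above:
#   Pixel Size       -> range(10, 21, 5) = {10, 15, 20}
#   Refinement Steps -> range(3, 30, 3)  = {3, 6, ..., 27}
heatmap, image = get_heatmap(
    image_url="https://example.com/gatto.jpg",  # made-up example URL
    text="un gatto che gioca",                  # example Italian caption ("a cat playing")
    pixel_size=10,   # smaller bands -> finer map, more masked crops to score
    iterations=3,    # more refinement steps -> smoother map, longer runtime
)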