g8a9 committed
Commit bd6347d • Parent: 35772cf

re-wording, grammarlyfy, update front page emoji

Files changed (3):
  1. README.md +1 -1
  2. examples.py +2 -5
  3. localization.py +30 -28
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Clip Italian Demo
-emoji: ⚑
+emoji: 🀌
 colorFrom: gray
 colorTo: pink
 sdk: streamlit
examples.py CHANGED
@@ -6,13 +6,10 @@ def app():
     st.title("Gallery")
     st.write(
         """
-
 
     Even though we trained the Italian CLIP model on way less examples than the original
-    OpenAI's CLIP, our training choices and quality datasets led to impressive results!
-    Here, we present some of **the most impressive text-image associations** learned by our model.
-
-    Remember you can head to the **Text to Image** section of the demo at any time to test your own🀌 Italian queries!
+    OpenAI's CLIP, our training choices and quality datasets led to impressive results.
+    Here, we present some of them.
 
     """
     )
localization.py CHANGED
@@ -13,10 +13,14 @@ import jax
 import gc
 
 
-preprocess = transforms.Compose([
-    transforms.ToTensor(),
-    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
-])
+preprocess = transforms.Compose(
+    [
+        transforms.ToTensor(),
+        transforms.Normalize(
+            (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)
+        ),
+    ]
+)
 
 
 def pad_to_square(image, size=224):
@@ -50,19 +54,19 @@ def gen_image_batch(image_url, image_size=224, pixel_size=10):
         masks.append(mask)
 
     for i in range(0, n_pixels):
-        for j in range(i+1, n_pixels):
+        for j in range(i + 1, n_pixels):
             m = mask.copy()
-            m[:min(i*pixel_size, image_size) + 1, :] = 0
-            m[min(j*pixel_size, image_size) + 1:, :] = 0
+            m[: min(i * pixel_size, image_size) + 1, :] = 0
+            m[min(j * pixel_size, image_size) + 1 :, :] = 0
             neg_m = 1 - m
             image_batch.append(image * m + gray * neg_m)
             masks.append(m)
 
-    for i in range(0, n_pixels+1):
-        for j in range(i+1, n_pixels+1):
+    for i in range(0, n_pixels + 1):
+        for j in range(i + 1, n_pixels + 1):
             m = mask.copy()
-            m[:, :min(i*pixel_size + 1, image_size)] = 0
-            m[:, min(j*pixel_size + 1, image_size):] = 0
+            m[:, : min(i * pixel_size + 1, image_size)] = 0
+            m[:, min(j * pixel_size + 1, image_size) :] = 0
             neg_m = 1 - m
             image_batch.append(image * m + gray * neg_m)
             masks.append(m)
@@ -75,7 +79,9 @@ def get_heatmap(image_url, text, pixel_size=10, iterations=3):
     model = get_model()
     image_size = model.config.vision_config.image_size
     text_embedding = text_encoder(text, model, tokenizer)
-    images, masks = gen_image_batch(image_url, image_size=image_size, pixel_size=pixel_size)
+    images, masks = gen_image_batch(
+        image_url, image_size=image_size, pixel_size=pixel_size
+    )
 
     input_image = images[0].copy()
     images = np.stack([preprocess(image) for image in images], axis=0)
@@ -106,10 +112,10 @@ def app():
 
     ### πŸ‘‹ Ciao!
 
-    Here you can find an example for zero shot localization that will show you where in an image the model sees an object.
+    Here you can find an example for zero-shot localization that will show you where in an image the model sees an object.
 
-    The location of the object is computed by masking different areas of the image and looking at
-    how the similarity to the image description changes. If you want to have a look at the implementation in details
+    The object location is computed by masking different areas of the image and looking at
+    how the similarity to the image description changes. If you want to have a look at the implementation in detail,
     you can find it in [this Colab](https://colab.research.google.com/drive/10neENr1DEAFq_GzsLqBDo0gZ50hOhkOr?usp=sharing).
 
     On the two parameters: the pixel size defines the resolution of the localization map. A pixel size of 15 means
@@ -132,17 +138,12 @@ def app():
 
     MAX_ITER = 1
 
-
     col1, col2 = st.beta_columns([3, 1])
 
     with col2:
-        pixel_size = st.selectbox(
-            "Pixel Size", options=range(10, 21, 5), index=0
-        )
+        pixel_size = st.selectbox("Pixel Size", options=range(10, 21, 5), index=0)
 
-        iterations = st.selectbox(
-            "Refinement Steps", options=range(3, 30, 3), index=0
-        )
+        iterations = st.selectbox("Refinement Steps", options=range(3, 30, 3), index=0)
 
         compute = st.button("LOCATE")
 
@@ -151,18 +152,19 @@ def app():
 
     if compute:
 
-        with st.spinner('Waiting for resources...'):
+        with st.spinner("Waiting for resources..."):
            sleep_time = 5
-            print('CPU_load', psutil.cpu_percent())
+            print("CPU_load", psutil.cpu_percent())
             while psutil.cpu_percent() > 60:
                 time.sleep(sleep_time)
 
-
        if not caption or not image_url:
            st.error("Please choose one image and at least one label")
        else:
-            with st.spinner("Computing... This might take up to a few minutes depending on the current load πŸ˜• \n"
-                            "[Colab Link](https://colab.research.google.com/drive/10neENr1DEAFq_GzsLqBDo0gZ50hOhkOr?usp=sharing)"):
+            with st.spinner(
+                "Computing... This might take up to a few minutes depending on the current load πŸ˜• \n"
+                "[Colab Link](https://colab.research.google.com/drive/10neENr1DEAFq_GzsLqBDo0gZ50hOhkOr?usp=sharing)"
+            ):
                heatmap, image = get_heatmap(image_url, caption, pixel_size, iterations)
 
                with col1:
@@ -172,7 +174,7 @@ def app():
         gc.collect()
 
     elif image_url:
-        image_raw = requests.get(image_url, stream=True, ).raw
+        image_raw = requests.get(image_url, stream=True,).raw
        image = Image.open(image_raw).convert("RGB")
        with col1:
            st.image(image)
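The hunks above only reformat `gen_image_batch` and `get_heatmap`, but they are the core of the trick the app text describes: hide bands of the image, re-score each partially hidden image against the caption, and accumulate the visible regions weighted by that score. A minimal self-contained sketch of the idea, assuming a stand-in `similarity_fn` for the repo's CLIP image-text scoring (which actually goes through `get_model`/`text_encoder`); the band layout only approximates the exact slicing in `gen_image_batch`:

import numpy as np

def band_masks(image_size=224, pixel_size=10):
    """Yield binary masks that keep one horizontal or vertical band of the
    image visible, loosely mirroring the loops in gen_image_batch."""
    n_pixels = image_size // pixel_size
    for axis in (0, 1):
        for i in range(n_pixels + 1):
            for j in range(i + 1, n_pixels + 1):
                m = np.ones((image_size, image_size, 1), dtype=np.float32)
                lo = min(i * pixel_size, image_size)
                hi = min(j * pixel_size, image_size)
                if axis == 0:
                    m[:lo, :] = 0  # hide rows above the band
                    m[hi:, :] = 0  # hide rows below the band
                else:
                    m[:, :lo] = 0  # hide columns left of the band
                    m[:, hi:] = 0  # hide columns right of the band
                yield m

def heatmap_sketch(image, similarity_fn, image_size=224, pixel_size=10):
    """Accumulate each mask weighted by how well the masked image still
    matches the caption. similarity_fn(masked_image) -> float is an assumed
    stand-in for the CLIP image-text score used inside get_heatmap."""
    gray = np.ones_like(image) * 0.5  # neutral fill for hidden areas
    heat = np.zeros((image_size, image_size, 1), dtype=np.float32)
    for m in band_masks(image_size, pixel_size):
        masked = image * m + gray * (1.0 - m)
        heat += m * similarity_fn(masked)
    return heat / heat.max()  # normalize to [0, 1]

Bright regions of the returned map are the areas whose visibility keeps the caption similarity high, which is exactly what the demo overlays on the input image.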
 
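On the two knobs the `st.selectbox` widgets expose, a hypothetical driver call (the URL and Italian caption are made-up examples; `get_heatmap` and its signature come from the hunks above): the pixel size sets the band width and therefore the resolution of the localization map, while the refinement steps presumably re-run the masking pass for a smoother map.

# Hypothetical usage matching the widget ranges above:
#   Pixel Size       -> range(10, 21, 5) = {10, 15, 20}
#   Refinement Steps -> range(3, 30, 3)  = {3, 6, ..., 27}
heatmap, image = get_heatmap(
    image_url="https://example.com/gatto.jpg",  # made-up example URL
    text="un gatto che gioca",                  # example Italian caption ("a cat playing")
    pixel_size=10,   # smaller bands -> finer map, more masked crops to score
    iterations=3,    # more refinement steps -> smoother map, longer runtime
)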