4rtemi5 committed
Commit a9e905c
1 Parent(s): 5701708

fix some stuff

Files changed (6)
  1. app.py +1 -2
  2. examples.py +4 -7
  3. home.py +0 -2
  4. image2text.py +11 -8
  5. introduction.md +3 -4
  6. text2image.py +11 -12
app.py CHANGED
@@ -15,8 +15,7 @@ PAGES = {
 st.sidebar.title("Explore our CLIP-Italian demo")
 
 logo = Image.open("static/img/clip_italian_logo.png")
-st.sidebar.image(logo)
-#, caption="CLIP-Italian logo"
+st.sidebar.image(logo, caption="CLIP-Italian logo")
 
 page = st.sidebar.radio("", list(PAGES.keys()))
 PAGES[page].app()
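For orientation, here is a minimal, self-contained sketch of the sidebar navigation pattern that app.py uses after this change. The shape of the `PAGES` dict and the page labels are assumptions reconstructed from the hunk context and the file list above, not shown in full in this diff.

```python
# Illustrative sketch only -- reconstructed from the hunk context, not the full app.py.
import streamlit as st
from PIL import Image

import home         # assumed page modules; each exposes an app() callable
import image2text
import text2image
import examples

# Assumed mapping from sidebar labels to page modules.
PAGES = {
    "Home": home,
    "Text to Image": text2image,
    "Image to Text": image2text,
    "Examples & Applications": examples,
}

st.sidebar.title("Explore our CLIP-Italian demo")

logo = Image.open("static/img/clip_italian_logo.png")
# This commit passes the caption directly instead of leaving it commented out.
st.sidebar.image(logo, caption="CLIP-Italian logo")

# The selected radio entry dispatches to that page's app() function.
page = st.sidebar.radio("", list(PAGES.keys()))
PAGES[page].app()
```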
examples.py CHANGED
@@ -3,16 +3,13 @@ import streamlit as st
 
 
 def app():
-    #st.title("Examples & Applications")
-    st.markdown("<h1 style='text-align: center; color: #CD212A;'> Examples & Applications </h1>", unsafe_allow_html=True)
-    st.markdown("<h2 style='text-align: center; color: #008C45; font-weight:bold;'> Complex Queries -Image Retrieval </h2>", unsafe_allow_html=True)
-
+    st.title("Examples & Applications")
     st.write(
         """
 
 
-        Even though we trained the Italian CLIP model on way less examples(~1.4M) than the original
-        OpenAI's CLIP (~400M), our training choices and quality datasets led to impressive results!
+        Even though we trained the Italian CLIP model on far fewer examples than the original
+        OpenAI CLIP, our training choices and quality datasets led to impressive results!
         Here, we present some of **the most impressive text-image associations** learned by our model.
 
         Remember you can head to the **Text to Image** section of the demo at any time to test your own 🤌 Italian queries!
@@ -59,4 +56,4 @@ def app():
         "Is the DALLE-mini logo an *avocado* or an armchair (*poltrona*)?")
 
     st.image("static/img/examples/dalle_mini.png")
-    st.markdown("It seems it's half an armchair and half an avocado! We thank the DALLE-mini team for the great idea :)")
+    st.markdown("It seems it's half an armchair and half an avocado! We thank the DALL-E mini team for the great idea :)")
home.py CHANGED
@@ -7,7 +7,5 @@ def read_markdown_file(markdown_file):
 
 
 def app():
-    st.markdown("<h1 style='text-align: center; color: #CD212A;'> CLIP-Italian </h1>", unsafe_allow_html=True)
-
     intro_markdown = read_markdown_file("introduction.md")
     st.markdown(intro_markdown, unsafe_allow_html=True)
image2text.py CHANGED
@@ -10,22 +10,25 @@ import gc
 
 
 def app():
-    #st.title("From Image to Text")
-    st.markdown("<h1 style='text-align: center; color: #CD212A;'> Zero Shot Image Classification </h1>", unsafe_allow_html=True)
-    st.markdown("<h2 style='text-align: center; color: #008C45; font-weight:bold;'> Image to Text </h2>", unsafe_allow_html=True)
+    st.title("From Image to Text")
     st.markdown(
         """
 
-        👋 Ciao! Here you can find the captions or the labels that are most related to a given image.
+        ### 👋 Ciao!
+
+        Here you can find the captions or the labels that are most related to a given image. It is a zero-shot
+        image classification task!
+
+        🤌 Italian mode on! 🤌
 
-        Try typing "gatto" (cat) in the space for label1 and "cane" (dog) in the space for label2 and click
+        For example, try typing "gatto" (cat) in the space for label 1 and "cane" (dog) in the space for label 2 and click
         "classify"!
 
         """
     )
 
     image_url = st.text_input(
-        "YOU CAN INPUT THE URL OF AN IMAGE : ",
+        "You can input the URL of an image",
         value="https://www.petdetective.it/wp-content/uploads/2016/04/gatto-toilette.jpg",
     )
 
@@ -35,14 +38,14 @@ def app():
 
     with col2:
         captions_count = st.selectbox(
-            "NUMBER OF LABELS", options=range(1, MAX_CAP + 1), index=1
+            "Number of labels", options=range(1, MAX_CAP + 1), index=1
         )
         compute = st.button("CLASSIFY")
 
     with col1:
         captions = list()
         for idx in range(min(MAX_CAP, captions_count)):
-            captions.append(st.text_input(f"INSERT LABEL {idx+1}"))
+            captions.append(st.text_input(f"Insert label {idx+1}"))
 
     if compute:
         captions = [c for c in captions if c != ""]
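The hunks above only touch labels and copy; the classification step itself is not part of this commit. As background, here is a hedged sketch of how labels collected this way can be scored against an image in a zero-shot fashion. It uses the public openai/clip-vit-base-patch32 checkpoint from transformers purely for illustration (an English model), not the demo's actual Italian CLIP code.

```python
# Hypothetical zero-shot scoring sketch; the demo's own model code is not in this diff.
import requests
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image_url = "https://www.petdetective.it/wp-content/uploads/2016/04/gatto-toilette.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)

captions = ["gatto", "cane"]  # the non-empty labels collected from the text inputs
inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Image-text similarity scores; softmax turns them into one probability per label.
probs = outputs.logits_per_image.softmax(dim=-1)
for caption, prob in zip(captions, probs[0].tolist()):
    print(f"{caption}: {prob:.2%}")
```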
introduction.md CHANGED
@@ -1,10 +1,9 @@
+# Italian CLIP
 
-CLIP-Italian is a **multimodal** model trained on **~1.4 Million** Italian text-image pairs using **Italian Bert** model as text encoder and Vision Transformer **ViT** as image encoder using the **JAX/Flax** neural network library. The training was carried out during the **Hugging Face** Community event on **Google's TPU** machines, sponsored by **Google Cloud**.
-
-Clip-Italian (Contrastive Language-Image Pre-training in Italian language) is based on OpenAI's CLIP ([Radford et al., 2021](https://arxiv.org/abs/2103.00020)) which is an amazing model that can learn to represent images and text jointly in the same space.
+CLIP ([Radford et al., 2021](https://arxiv.org/abs/2103.00020)) is an amazing model that can learn to represent images and text jointly in the same space.
 
 In this project, we aim to propose the first CLIP model trained on Italian data, that in this context can be considered a
-low resource language. Using a few techniques, we have been able to fine-tune a SOTA Italian CLIP model with **only 1.4M** training samples. Our Italian CLIP model
+low-resource language. Using a few techniques, we have been able to fine-tune a SOTA Italian CLIP model with **only 1.4 million** training samples. Our Italian CLIP model
 is built upon the pre-trained [Italian BERT](https://huggingface.co/dbmdz/bert-base-italian-xxl-cased) model provided by [dbmdz](https://huggingface.co/dbmdz) and the OpenAI
 [vision transformer](https://huggingface.co/openai/clip-vit-base-patch32).
 
text2image.py CHANGED
@@ -107,21 +107,21 @@ headers = {
 
 def app():
 
-    #st.title("From Text to Image")
-    st.markdown("<h1 style='text-align: center; color: #CD212A;'> Image Retrieval </h1>", unsafe_allow_html=True)
-    st.markdown("<h2 style='text-align: center; color: #008C45; font-weight:bold;'> Text to Image </h2>", unsafe_allow_html=True)
+    st.title("From Text to Image")
     st.markdown(
         """
 
-
-        👋 Ciao! Here you can type Italian query and search from ~150k images in the Conceptual Captions (CC) dataset or 25k Photos in the Unsplash dataset.
+        ### 👋 Ciao!
+
+        Here you can search ~150,000 images in the Conceptual Captions (CC) dataset or in the Unsplash 25k Photos dataset.
+        Even though we did not train on any of these images, you will see most queries make sense. When you see errors, there might be two possibilities:
+        the model is answering in a wrong way, or the image you are looking for is not in the dataset and the model is giving you the best answer it can get.
+
 
-        Though these images were not used for training the model, you will see most queries make sense.
 
-        Rare errors might be due to 2 possibilities:
-        a)The model is answering in a wrong way or b) the image you are looking for are not in the dataset & the model is giving you the best answer it can get.
+        🤌 Italian mode on! 🤌
 
-        You can choose from one of the following examples :
+        You can choose from one of the following examples:
         """
     )
 
@@ -157,7 +157,7 @@ def app():
 
     col1, col2 = st.beta_columns([3, 1])
     with col1:
-        query = st.text_input("OR INSERT AN ITALIAN QUERY TEXT : ")
+        query = st.text_input("... or insert an Italian query text")
     with col2:
         dataset_name = st.selectbox("IR dataset", ["CC", "Unsplash"])
 
@@ -200,8 +200,7 @@ def app():
                 break
             except (UnidentifiedImageError) as e:
                 if i == N - 1:
-                    st.text(f'Tried to show {N} different image URLS but none of them were reachabele.\
-                             Maybe try a different query?')
+                    st.text(f'Tried to show {N} different image URLs but none of them were reachable.\nMaybe try a different query?')
 
     gc.collect()
 
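The last hunk reformats the fallback message in the image display loop. For context, here is a self-contained sketch of that try-several-URLs pattern; the retrieval step, the function name, and `N` are placeholders not shown in this diff.

```python
# Illustrative sketch of the display loop edited above; retrieval is mocked out.
import gc

import requests
import streamlit as st
from PIL import Image, UnidentifiedImageError

N = 3  # assumed number of candidate URLs to try for a query


def show_first_reachable(image_urls):
    """Show the first URL that decodes to an image, else print a fallback message."""
    for i, url in enumerate(image_urls[:N]):
        try:
            image = Image.open(requests.get(url, stream=True).raw)
            st.image(image, use_column_width=True)
            break
        except UnidentifiedImageError:
            # Only complain once all candidate URLs have failed.
            if i == N - 1:
                st.text(
                    f"Tried to show {N} different image URLs but none of them "
                    "were reachable.\nMaybe try a different query?"
                )
    gc.collect()
```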